Example #1
    def transform(self, X, y=None):
        X_transformed = []  # will contain the counts of words in emails
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls:
                url_extractor = urlextract.URLExtract()
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming:
                stemmer = nltk.PorterStemmer()
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)

        return np.array(X_transformed)
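For orientation, here is a minimal standalone sketch of the same preprocessing steps applied to a single string; the sample text is invented, and it assumes the urlextract and nltk packages are installed and that URLExtract can load its TLD list.

import re
from collections import Counter

import nltk
import urlextract

text = "Check https://example.com and example.org, price 3.50 today!"
extractor = urlextract.URLExtract()
# replace the longest URLs first so a shorter URL embedded in a longer one is not hit twice
for url in sorted(set(extractor.find_urls(text)), key=len, reverse=True):
    text = text.replace(url, " URL ")
text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)  # numbers -> NUMBER
text = re.sub(r'\W+', ' ', text)                            # punctuation -> spaces
stemmer = nltk.PorterStemmer()
print(Counter(stemmer.stem(word) for word in text.lower().split()))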
Example #2
def _extract(text, regex=None):
    """
    Extract all the uris from the given text.
    """
    text = text.replace('=\n', '')
    if regex is not None:
        return re.findall(regex, text)

    extractor = urlextract.URLExtract()
    extractor.extract_email = True
    return extractor.find_urls(text, only_unique=True)
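A standalone sketch of the same idea follows; the sample strings are invented, and the second call shows what the custom-regex branch would return.

import re
import urlextract

extractor = urlextract.URLExtract()
extractor.extract_email = True   # mirrors the extract_email flag set in _extract() above
text = "Visit https://exam=\nple.com or write to user@example.com".replace('=\n', '')
print(extractor.find_urls(text, only_unique=True))
print(re.findall(r"id=\d+", "id=123 id=456"))   # the regex path of _extract(text, regex=...)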
Example #3
    def __init__(self, config: perlink.config.Configuration) -> None:
        super().__init__(">!")
        self.id = config.bot_id
        self._config = config
        self._db = perlink.db.Database(self)
        logger.info(f"Bot created: {self.id}")

        self._url_extractor = urlextract.URLExtract(
            extract_email=False,
            extract_localhost=False,
        )
Example #4
    def __init__(self):
        
        self.stopwords = stopwords.words('english')

        # self.ps = PorterStemmer()
        self.lm = WordNetLemmatizer()
        # cache so each unique word is lemmatized (or stemmed) only once
        # self.stemmed = dict()
        self.lemmetized = dict()

        self.url_extractor = urlextract.URLExtract()
Example #5
 def __init__(self):
     self.url_extractor = urlextract.URLExtract()
     self.url_extractor.update()
     self.tag_regex = re.compile(r"<[^>]*>")
     self.email_regex = re.compile(r"[^\s]+@[^\s]+")
     self.number_regex = re.compile(r'\d+(?:\.\d*(?:[eE]\d+))?')
     self.spaces_regex = re.compile(r"\s+")
     self.special_chars = [
         "<", "[", "]", "`", "^", ">", "+", "?", "!", "'", ".", ",", ":",
         "*", "%", "#", "_", "=", "-", "&", '/', '\\', '(', ')', ";", "\"",
         "«", "»", "|", "•", "—", "–", "●", "►", "\n", "@", "$"
     ]
Example #6
    def __init__(self, includeSubject=True, stripNumbers=True, stripStopWords=True):
        """Initialiser

        Args:
            includeSubject (bool, optional): Include the subject as well as the email body. Defaults to True.
            stripNumbers (bool, optional): Strip numbers from text, replacing them with "NUMBER". Defaults to True.
            stripStopWords (bool, optional): Strip stop words from the text. Defaults to True.
        """
        self.url_extractor = urlextract.URLExtract()
        self.stemmer = nltk.PorterStemmer()
        self.includeSubject = includeSubject
        self.stripNumbers = stripNumbers
        self.stripStopWords = stripStopWords
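Only the initialiser is shown above, so as a rough illustration of what the documented flags usually translate to, here is a standalone sketch of number stripping, stop-word removal, and Porter stemming; the sample sentence is invented and it assumes the NLTK stopwords corpus can be downloaded.

import re
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords", quiet=True)
text = "Meeting moved to room 12 because 3 projectors are broken"
text = re.sub(r"\d+", " NUMBER ", text)                      # stripNumbers=True
stop = set(stopwords.words("english"))
words = [w for w in text.lower().split() if w not in stop]   # stripStopWords=True
stemmer = nltk.PorterStemmer()
print([stemmer.stem(w) for w in words])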
Example #7
 def __init__(self,
              strip_headers=True,
              lower_case=True,
              remove_punctuation=True,
              replace_urls=True,
              replace_numbers=True,
              stemming=True):
     self.strip_headers = strip_headers
     self.lower_case = lower_case
     self.remove_punctuation = remove_punctuation
     self.replace_urls = replace_urls
     self.replace_numbers = replace_numbers
     self.stemming = stemming
     self.url_extractor = urlextract.URLExtract()
     self.stemmer = nltk.PorterStemmer()
Example #8
    def __init__(self):
        self.authors_dict = None
        self.publications_dict = None
        self.authors_pubs = dict()
        self.titles_list = []
        self.venues_dict = dict()
        self.years_dict = dict()
        self.auth_id_ind = dict()
        self.pub_id_ind = dict()

        self.__lang_predictor = langid.classify
        self.__lang_translator = Translator()
        self.__stop_words = stopwords.words('english')
        self.__url_extractor = urlextract.URLExtract()
        self.__tokenizer = RegexpTokenizer(r'\w+')
        self.__stemmer = PorterStemmer()
        nltk.download('stopwords')

        self.__attr_names = [
            "authors", "coAuthors", "publications", "titles", "venues", "years"
        ]
Example #9
 def transform(self, X, y=None):
     X_transformed = []
     for article in X:
         text = " ".join(article) if self.include_subj else " ".join(
             article[1:])
         if self.replace_html:
             text = self.__html_to_plain_text__(text)
         if self.replace_urls:
             url_extractor = urlextract.URLExtract()
             urls = list(set(url_extractor.find_urls(text)))
             urls.sort(key=lambda url: len(url), reverse=True)
             for url in urls:
                 text = text.replace(url, " URL ")
         if self.replace_numbers:
             text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', ' NUMBER ', text)
         if self.remove_punctuation:
              text = text.replace("'", "").replace("’", "")  # because we don't want these replaced by spaces
             text = re.sub(r'\W+', ' ', text, flags=re.M)
         X_transformed.append(text)
     return X_transformed
Example #10
def user(userID,per):
    # stopwords = stopwords.words('english')
    extractor = urlextract.URLExtract()
    translator = Translator()
    # stopwords = stopwords.words('english')


    # user_id=[]
    # with open('users.csv', 'rb') as fil:
    #     user = csv.reader(fil)
    #     for row in user:
    #         user_id.append(row[2]) 

    # initialization
    # total_posts=0
    # total_postsA=0

    message=[]
    messageA=[]

    total_posts_per_year=0
    total_posts_per_yearA=0

    shared=0
    added=0
    posted=0
    updated=0 
    sharedA=0
    addedA=0
    postedA=0
    updatedA=0 

    langu=[]
    languA=[]

    months=[0]*12
    monthsA=[0]*12


    daysX_monthsY=np.zeros([12,31])
    daysX_monthsYA=np.zeros([12,31])

    postLength=[]
    average_nbr_words=0
    urls_Size=0
    average_nbr_links=0
    average_nbr_posts_month=0

    # sharedRatio=0
    # updateRatio=0
    # addRatio=0
    # postRatio=0
    # sharedRatioA=0
    # updateRatioA=0
    # addRatioA=0
    # postRatioA=0

    season=[0]*4 # winter spring summer autumn
    seasonA=[0]*4 # winter spring summer autumn

    hashTags=[] # store hashtags used
    number_hash=0

    weekends=0
    postsWeekends52=0
    postsWeekends51=0
    postsWeekends50=0
    weekendsA=0
    postsWeekends52A=0
    postsWeekends51A=0
    postsWeekends50A=0

    average_posts_weekEnd=0
    average_posts_summer=0
    average_posts_winter=0
    average_posts_spring=0
    average_posts_autumn=0

    tophash=[]
    tags=[]
    tagnbr=[]
    tagsA=[]
    tagnbrA=[]
    taggedposts=0
    taggedpostsA=0


    weekposts52=0
    weekposts51=0
    weekposts50=0
    weekposts52A=0
    weekposts51A=0
    weekposts50A=0

    daysweek=np.zeros([3,7])
    daysweekA=np.zeros([3,7])

    activityM52=0    
    activityN52=0
    activityM51=0
    activityN51=0 
    activityM50=0
    activityN50=0

    activityM52A=0    
    activityN52A=0
    activityM51A=0
    activityN51A=0 
    activityM50A=0
    activityN50A=0 

    url=[]
    urlA=[]
    urlSize=0
    urlSizeA=0

    user=0
    userA=0
    # shows whether they post M(1)or N(0) or 2 Both
    personalityActivityTime=[1,1,0,1,0,1,2,2,1,1,1,1,0,0,0,2,0,1,2,1,1,0,1]
    # shows whether they  none 0 or add 1 share 2 update 3 post 4
    personalityTypePost=[2,0,2,2,0,2,0,0,4,0,2,4,2,2,4,2,2,2,2,0,2,0,0]
    # shows how many times they post none 0 or hour 1 couples of day 2 once per day 3 rarely 4
    personalityDay=[4,4,3,4,2,4,1,4,2,4,4,4,4,4,2,4,3,4,4,4,4,4]

    path='user_posts_'+userID+'.csv'
    pathA='user_posts_'+userID+'.csv'

    def get_wordnet_pos(treebank_tag):
        
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return  wordnet.NOUN

    def lima(word, words):

        lemmatiser = WordNetLemmatizer()
        words_tag = dict(pos_tag(words))
        return  lemmatiser.lemmatize(word, get_wordnet_pos(words_tag.get(word)))


    def clean(words):
        # words = re.sub('[^a-zA-Z]', '', words.lower()).split()
        tknzr = TweetTokenizer()
        # tokenizer = RegexpTokenizer('\w+|\S+')
        # words=nltk.word_tokenize(words.lower())
        words = tknzr.tokenize(words)
        exclude = set(string.punctuation)
        words2 = [word for word in words if
                not word in exclude]
        words_tag = dict(pos_tag(words))
        words = [word.lower() for word in words2 if
                not word in nltk.corpus.stopwords.words('english') and not word.isdigit()]
        # print(words)
        words = [lima(word, words) for word in words]
        # print(words)
        words = ' '.join(words)
        # print(words)
        return words


    def display_topics(model, feature_names, no_top_words):
        # collect the top word(s) of each NMF topic as candidate hashtag themes
        for topic in model.components_:
            for i in topic.argsort()[:-no_top_words - 1:-1]:
                tophash.append(feature_names[i].encode("utf-8"))


    def topic_hash(hashtags):
        vectorizer = TfidfVectorizer(min_df=0.2,stop_words='english')
        X = vectorizer.fit_transform(hashtags)
        no_topics = min(10,len(hashtags))
        nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(X)
        display_topics(nmf, vectorizer.get_feature_names(), 1)



    def extract_hash(word):
        i=0
        h=''
        while(i<len(word)):
            s=word[i]
            if(s=='#'):
                i+=1
                while(i<len(word) and not(word[i]=='#' or word[i]==' ' or word[i]=="\n")):
                    s=word[i]
                    h+=s
                    i+=1
                hashTags.append(h)
                h=''
            else:
                i+=1


    def get_season(now):
        if isinstance(now, datetime):
            now = now.date()
        return next(s for s, (start, end) in seasons
                    if start <= now <= end)

    def get_tags(tag):
        tag=tag.replace('u','')
        tag=tag.replace('[','')
        tag=tag.replace(']','')
        tag=tag.replace('\'','')
        tag= (tag.split(','))
        tagnbr.append(len(tag))
        i=0
        while(i<len(tag)):
            if(int(tag[i]) not in tags):
                tags.append(int(tag[i]))
            i+=1
        return tags

    def get_tags_anoamly(tagA):
        tagL=tagA.replace('u','')
        tagL=tagL.replace('[','')
        tagL=tagL.replace(']','')
        tagL=tagL.replace('\'','')
        tagL= (tagL.split(','))
        tagnbrA.append(len(tagL))
        l=0
        while(l<len(tagL)):
            if(int(tagL[l]) not in tagsA):
                tagsA.append(int(tagL[l]))
                
            l+=1
        return tagsA
        
                
    with open(path, 'rb') as f:
        posts = csv.reader(f)
        for items in posts:
            # check date time
            datetime_object = datetime.strptime(items[3],"%Y-%m-%d %H:%M:%S")    
            hour=datetime_object.hour
            month=datetime_object.month
            year=datetime_object.year
            day=datetime_object.day
            dates=datetime.date(datetime_object)
            
            seasons = [('winter', (date(year,  1,  1),  date(year,  3, 20))),
                        ('spring', (date(year,  3, 21),  date(year,  6, 20))),
                        ('summer', (date(year,  6, 21),  date(year,  9, 22))),
                        ('autumn', (date(year,  9, 23),  date(year, 12, 20))),
                        ('winter', (date(year, 12, 21),  date(year, 12, 31)))]
        
                
            # activity in 2017 
            if(year==2017):
                
                total_posts_per_year+=1
                # posts/month
                months[month-1]+=1
                # posts /day
                daysX_monthsY[month-1][day-1]+=1
                # week number
                weekNumber = dates.isocalendar()[1]
                # activity of last 3 weeks in 2017
                # total nbr of posts/each week
                # posts/day in each week
                # activity time in each day in each week
                if(weekNumber==51):
                    weekend=datetime_object.weekday()
                    if(weekend==4 or weekend==5):
                        # weekends+=1
                        postsWeekends52+=1
                    weekposts52+=1
                    daysweek[0][(dates.weekday())-1]+=1
                    if(hour>=6 and hour<18):
                        activityM52+=1
                    elif(hour>=18 and hour <24):
                        activityN52+=1
                    elif(hour >=0 and hour <6):
                        activityN52+=1

                if(weekNumber==50):
                    weekend=datetime_object.weekday()
                    if(weekend==4 or weekend==5):
                        # weekends+=1
                        postsWeekends51+=1
                    weekposts51+=1
                    if(hour>=6 and hour<18):
                        activityM51+=1
                    elif(hour>=18 and hour <24):
                        activityN51+=1
                    elif(hour >=0 and hour <6):
                        activityN51+=1
                    daysweek[1][(dates.weekday())-1]+=1

                if(weekNumber==49):
                    weekend=datetime_object.weekday()
                    if(weekend==4 or weekend==5):
                        # weekends+=1
                        postsWeekends50+=1
                    weekposts50+=1
                    if(hour>=6 and hour<18):
                        activityM50+=1
                    elif(hour>=18 and hour <24):
                        activityN50+=1
                    elif(hour >=0 and hour <6):
                        activityN50+=1
                    daysweek[2][(dates.weekday())-1]+=1
                # which type they use most share/add/update/post
                if(items[4]=='added'):
                    added+=1
                elif(items[4]=='updated'):
                    updated+=1
                elif(items[4]=='posted'):
                    posted+=1
                else:
                    shared+=1
                
            # season
                if(get_season(datetime_object)=='winter'):
                    season[0]+=1
                elif(get_season(datetime_object)=='spring'):
                    season[1]+=1
                elif(get_season(datetime_object)=='summer'):
                    season[2]+=1
                elif(get_season(datetime_object)=='autumn'):
                    season[3]+=1
                weekends= len([1 for i in calendar.monthcalendar(2017,
                                    12) if i[5] != 0])

                weekends+=len([1 for i in calendar.monthcalendar(2017,
                                    12) if i[4] != 0])


                        
                
                # lang detector
                if(items[0]):
                    t=translator.detect(json.dumps(items[0].decode('utf-8')))
                    langu.append(t.lang)
                    # msg=items[0]
                    # if(t.lang=="en"):
                    #     message.append(clean(msg).encode('utf-8'))
                # tags
                k=items[5]
                if(len(k)>2):
                    taggedposts+=1
                    get_tags(k)

                urls = extractor.find_urls(items[0])
                url.append(urls)
                urlSize+=len(urls)
            
            # nbr of words
            # word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
            # mess=items[0]
            # mess = unicode(mess, errors='ignore')
            # words1 = word_tokenizer.tokenize(mess.lower())
            # words=[word for word in words1 if
            #     not word in stopwords]
            # postLength.append(len(words))

            # nbr of urls
            
            
            
            
            # extract hashTags
            extract_hash(items[0])

            # count hash tags
            number_hash+=items[0].count('#')



    #  anomaly user
    with open(pathA, 'rb') as files:
        postsA = csv.reader(files)
        for anomaly in postsA:
            # check date time
            datetime_object = datetime.strptime(anomaly[3],"%Y-%m-%d %H:%M:%S")    
            hour=datetime_object.hour
            month=datetime_object.month
            year=datetime_object.year
            day=datetime_object.day
            dates=datetime.date(datetime_object)
            
            seasons = [('winter', (date(year,  1,  1),  date(year,  3, 20))),
                        ('spring', (date(year,  3, 21),  date(year,  6, 20))),
                        ('summer', (date(year,  6, 21),  date(year,  9, 22))),
                        ('autumn', (date(year,  9, 23),  date(year, 12, 20))),
                        ('winter', (date(year, 12, 21),  date(year, 12, 31)))]
        
                
            # activity in 2017 
            if(year==2017):
                
                total_posts_per_yearA+=1
                # posts/month
                monthsA[month-1]+=1
                # posts /day
                daysX_monthsYA[month-1][day-1]+=1
                # week number
                weekNumber = dates.isocalendar()[1]
                # activity of last 3 weeks in 2017
                # total nbr of posts/each week
                # posts/day in each week
                # activity time in each day in each week
                if(weekNumber==51):
                    weekend=datetime_object.weekday()
                    if(weekend==4 or weekend==5):
                        # weekendsA+=1
                        postsWeekends52A+=1
                    weekposts52A+=1
                    daysweekA[0][(dates.weekday())-1]+=1
                    if(hour>=6 and hour<18):
                        activityM52A+=1
                    elif(hour>=18 and hour <24):
                        activityN52A+=1
                    elif(hour >=0 and hour <6):
                        activityN52A+=1
                if(weekNumber==50):
                    weekend=datetime_object.weekday()
                    if(weekend==4 or weekend==5):
                        # weekendsA+=1
                        postsWeekends51A+=1
                    weekposts51A+=1
                    if(hour>=6 and hour<18):
                        activityM51A+=1
                    elif(hour>=18 and hour <24):
                        activityN51A+=1
                    elif(hour >=0 and hour <6):
                        activityN51A+=1
                    daysweekA[1][(dates.weekday())-1]+=1

                if(weekNumber==49):
                    weekend=datetime_object.weekday()
                    if(weekend==4 or weekend==5):
                        # weekendsA+=1
                        postsWeekends50A+=1
                    weekposts50A+=1
                    if(hour>=6 and hour<18):
                        activityM50A+=1
                    elif(hour>=18 and hour <24):
                        activityN50A+=1
                    elif(hour >=0 and hour <6):
                        activityN50A+=1
                    daysweekA[2][(dates.weekday())-1]+=1

                # which type they use most share/add/update/post
                if(anomaly[4]=='added'):
                    addedA+=1
                elif(anomaly[4]=='updated'):
                    updatedA+=1
                elif(anomaly[4]=='posted'):
                    postedA+=1
                else:
                    sharedA+=1
                
            # season
                if(get_season(datetime_object)=='winter'):
                    seasonA[0]+=1
                elif(get_season(datetime_object)=='spring'):
                    seasonA[1]+=1
                elif(get_season(datetime_object)=='summer'):
                    seasonA[2]+=1
                elif(get_season(datetime_object)=='autumn'):
                    seasonA[3]+=1
                
                weekendsA= len([1 for i in calendar.monthcalendar(2017,
                                    12) if i[5] != 0])

                weekendsA+=len([1 for i in calendar.monthcalendar(2017,
                                    12) if i[4] != 0])
                


                # tags
                kA=anomaly[5]
                if(len(kA)>2):
                    taggedpostsA+=1
                    get_tags_anoamly(kA)
            
                        
                
                
                
                # lang detector
                tA=translator.detect(json.dumps(anomaly[0].decode('utf-8')))
                languA.append(tA.lang)

                # msgA=anomaly[0]
                # if(tA.lang=='en'):
                #     messageA.append(clean(msgA).encode('utf-8'))

                urlsA = extractor.find_urls(anomaly[0])
                urlA.append(urlsA)
                urlSizeA+=len(urlsA)
                



            # # nbr of words
            # word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
            # mess=anomaly[0]
            # mess = unicode(mess, errors='ignore')
            # words1 = word_tokenizer.tokenize(mess.lower())
            # words=[word for word in words1 if
            #     not word in stopwords]
            # postLength.append(len(words))

            # nbr of urls
            urls = extractor.find_urls(anomaly[0])
            urls_Size+=len(urls)
            
            
            
            # extract hashTags
            extract_hash(anomaly[0])

            # count hash tags
            number_hash+=anomaly[0].count('#')

    scoreA1=2
    scoreB1=2
    import topicModel
    scoreTA,scoreTB= topicModel.topic(userID,userID,scoreA1,scoreB1)


    import lang
    scoreLA,scoreLB= lang.language(langu, languA,total_posts_per_year,total_posts_per_yearA,scoreTA,scoreTB)
    # print added,updated,posted,shared
    # print addedA,updatedA,postedA,sharedA


    import typeOfPost
    scorePA,scorePB= typeOfPost.type_post(added,shared,updated,posted,total_posts_per_year,addedA,sharedA,updatedA,postedA,total_posts_per_yearA,personalityTypePost[per],scoreLA,scoreLB)

    
    import tagged

    scoreTagA,scoreTagB= tagged.tagged(tags,tagnbr,taggedposts,total_posts_per_year,tagsA,tagnbrA,taggedpostsA,total_posts_per_yearA,scorePA,scorePB)

    import freq

    scoreFA,scoreFB= freq.frequency(postsWeekends52,postsWeekends51,postsWeekends50,weekends,postsWeekends52A,postsWeekends51A,
    postsWeekends50A,weekendsA,activityM52, activityM51, activityM50, activityN52, activityN51, activityN50,
    activityM52A, activityM51A, activityM50A, activityN52A, activityN51A, activityN50A, personalityActivityTime[per]
    ,daysweek, daysweekA,season,total_posts_per_year,seasonA,total_posts_per_yearA,weekposts52,weekposts51,weekposts50,weekposts52A,
    weekposts51A,weekposts50A,personalityDay[per],scoreTagA,scoreTagB)

    # import links

    # links.link(url, urlSize, total_posts_per_year, urlA, urlSizeA, total_posts_per_yearA)

    return scoreFA,scoreFB
Example #11
import concurrent.futures
import logging
import queue
import string
import threading
import time
from typing import Dict, List, NoReturn, Optional, Tuple

import ircstyle
import miniirc
import urlextract

from . import config
from .title import url_title_reader
from .util.urllib import validate_parsed_url

PUNCTUATION = tuple(string.punctuation)

log = logging.getLogger(__name__)
url_extractor = urlextract.URLExtract()  # pylint: disable=invalid-name


def _alert(irc: miniirc.IRC, msg: str, loglevel: int = logging.ERROR) -> None:
    log.log(loglevel, msg)
    irc.msg(config.INSTANCE["alerts_channel"], msg)


class Bot:
    """Bot."""

    EXECUTORS: Dict[str, concurrent.futures.ThreadPoolExecutor] = {}
    QUEUES: Dict[str, queue.SimpleQueue] = {}

    def __init__(self) -> None:
        log.info("Initializing bot as: %s",
Example #12
def parallel_file_extraction_worker(file):
    extractor = urlextract.URLExtract()
    with open(file) as f:
        return list(map(lambda x: (file, x),
                        extractor.find_urls(f.read())))
Example #13
def check_for_emails():
    threading.Timer(10, check_for_emails).start()

    mail = imaplib.IMAP4_SSL("imap.gmail.com")
    mail.login("*****@*****.**",
               open("password.txt", "r").read().strip())
    for folder in ["\"[Gmail]/Kaikki viestit\"", "\"[Gmail]/Roskaposti\""]:
        res = mail.select(folder)

        status, data = mail.search(None, "UnSeen")

        mail_ids = []

        for block in data:
            mail_ids += block.split()

        if len(mail_ids) > 0:
            print(str(len(mail_ids)) + " new emails")

        for i in mail_ids:
            status, data = mail.fetch(i, "(RFC822)")

            for response_part in data:
                if isinstance(response_part, tuple):

                    message = email.message_from_bytes(response_part[1])

                    mail_from_original = message["from"]
                    mail_subject = message["subject"]

                    mail_time = dparser.parse(
                        message["received"].split("\n")[1].strip(),
                        fuzzy=True).astimezone().strftime("%d/%m/%y %H %p")

                    sender_search = re.search(r"[\w\.-]+@[\w\.-]+\.\w+",
                                              mail_from_original)
                    if sender_search:
                        mail_from_original = sender_search.group(0)

                    if message.is_multipart():
                        mail_content = ""

                        for part in message.get_payload():
                            if part.get_content_type() in [
                                    "text/plain", "text/html",
                                    "multipart/alternative"
                            ]:
                                if type(part.get_payload()) == list:
                                    for m in part.get_payload():
                                        tmp = m.as_string()
                                        tmp = tmp.split("\n\n", 1)[1]

                                        try:
                                            tmp = base64.b64decode(tmp).decode(
                                                "utf-8")
                                        except:
                                            pass
                                        mail_content += tmp
                                        break
                                else:
                                    mail_content += part.get_payload()
                    else:
                        mail_content = message.get_payload()

                    try:
                        mail_content = base64.b64decode(mail_content).decode(
                            "utf-8")
                    except:
                        pass

                    mail_from = mail_from_original
                    if "Subject: " in mail_content or "From: " in mail_content:
                        for l in mail_content.split("\n"):
                            if l.startswith("Subject: "):
                                mail_subject = l[len("Subject: "):]
                            if l.startswith("From: "):
                                mail_from = l[len("From: "):]
                            if l.startswith("Von: "):
                                mail_from = l[len("Von: "):]

                    mail_subject = mail_subject.lower()
                    while mail_subject.startswith(
                            "fw: ") or mail_subject.startswith("fwd: "):
                        if mail_subject.startswith("fw: "):
                            mail_subject = mail_subject[4:]
                        if mail_subject.startswith("fwd: "):
                            mail_subject = mail_subject[5:]

                    sender_search = re.search(r"[\w\.-]+@[\w\.-]+\.\w+",
                                              mail_from)
                    if sender_search:
                        mail_from = sender_search.group(0)

                    links = urlextract.URLExtract().find_urls(mail_content)

                    for i in range(len(links)):
                        links[i] = urlparse(links[i]).hostname

                    links = list(set(links))

                    for i in range(len(links) - 1, -1, -1):
                        if links[i] is None or len(links[i]) == 0:
                            del links[i]

                    mailObject = {
                        "reporter": mail_from_original,
                        "subject": mail_subject,
                        "links": links,
                        "time": mail_time
                    }
                    add_message(mail_from, mailObject)
Example #14
def extract_urls(
    text: str,
    log: logging.Logger,
    examples: bool = False,
    common: bool = False,
) -> set:
    """
    Return a list of URLs in a text string.

    @param      text      The text to extract URLs from
    @param      log       The log
    @param      examples  Include example URLs
    @param      common    Include URLs that are common in IETF documents

    @return     List of URLs.
    """

    # find all URLs
    extractor = urlextract.URLExtract()
    extractor.update_when_older(7)  # update TLDs when older than 7 days
    text = unfold(text)
    urls = []
    for url in extractor.gen_urls(text):
        url = url.rstrip(".\"]'>;,")
        # if not re.search(r"://", url):
        #     url = "http://" + url
        if re.fullmatch(r"[\d\.:a-f]+", url, flags=re.IGNORECASE):
            # skip literal IP addresses
            continue
        try:
            urllib.parse.urlparse(url).netloc
        except ValueError as err:
            log.warning("%s: %s", err, url)
            continue
        urls.append(url)

    if not examples:
        # remove example URLs
        urls = [
            u for u in urls if not re.search(
                r"example\.(com|net|org)|\.example",
                urllib.parse.urlparse(u).netloc if urllib.parse.urlparse(u).
                netloc else u,
                re.IGNORECASE,
            )
        ]

    if not common:
        # remove some common URLs
        urls = [
            u for u in urls if not re.search(
                r"""https?://
                    datatracker\.ietf\.org/drafts/current/|
                    trustee\.ietf\.org/license-info|
                    (www\.)?rfc-editor\.org/info/rfc\d+|
                    (www\.)?ietf\.org/archive/id/draft-""",
                u,
                flags=re.VERBOSE | re.IGNORECASE,
            )
        ]

    return set(urls)
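A minimal standalone sketch of the same approach, showing the TLD-list refresh, generator-based extraction with trailing-punctuation trimming, and the example-domain filter; the sample text is invented, the unfold() step is skipped, the common-URL filter is omitted, and it assumes urlextract can refresh its TLD cache.

import re
import urllib.parse
import urlextract

extractor = urlextract.URLExtract()
extractor.update_when_older(7)        # refresh the TLD list if older than 7 days

text = "See https://tools.example.com/x, https://www.rfc-editor.org/info/rfc2119, and https://github.com/lipoja/URLExtract."
urls = []
for url in extractor.gen_urls(text):
    urls.append(url.rstrip(".\"]'>;,"))   # trim trailing punctuation

# drop URLs whose host is an example domain, as the examples=False path above does
urls = [u for u in urls
        if not re.search(r"example\.(com|net|org)|\.example",
                         urllib.parse.urlparse(u).netloc or u, re.IGNORECASE)]
print(set(urls))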
Example #15
 def __init__(self, client):
     self.client = client
     self.extractor = urlextract.URLExtract()
     self.command_handler = CommandHandler.CommandHandler(client)
Example #16
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except:
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        else:
            html = content
    if html:
        return html_to_plain_text(html)


urlextract = urlextract.URLExtract()
stemmer = nltk.PorterStemmer()


class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self,
                 strip_header=True,
                 lower_case=True,
                 remove_punctuation=True,
                 replace_urls=True,
                 replace_number=True,
                 stemming=True):
        self.strip_header = strip_header
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
Example #17
    try:
        import nltk  # Natural Language Toolkit

        stemmer = nltk.PorterStemmer()  # suffix stripper
        for word in ("Computations", "Computation", "Computing", "Computed",
                     "Compute", "Compulsive"):
            print(word, "=>", stemmer.stem(
                word))  # output:comput. It's a suffix stripping function.
    except ImportError:
        print("Error: stemming requires the NLTK module.")
        stemmer = None

    try:
        import urlextract  # may require an Internet connection to download root domain names

        url_extractor = urlextract.URLExtract()  # a tool that can find URLs in a sentence
        print(
            url_extractor.find_urls(
                "Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"
            ))
    except ImportError:
        print("Error: replacing URLs requires the urlextract module.")
        url_extractor = None

    print('-----preparation done-----')

    print('make classifier')

    X_few = X_train[:3]  # train sample
    X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(
        X_few)  # count emails' words
Example #18
def urlExtract(text):
    extr = urlextract.URLExtract()
    urls = extr.find_urls(text=text)
    return urls
Example #19
 def __init__(self, bot: commands.Bot):
     self.bot = bot
     self.urlextractor = urlextract.URLExtract()
Example #20
import csv

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import string
import uritools
import urlextract
from langdetect import detect
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

extractor = urlextract.URLExtract()
ps = PorterStemmer()



data = {'airline_sentiment':[],'text':[]}
airline_sentiment = []
corpus = []

with open('Tweets.csv', 'r', encoding='utf8') as f:
    tweets = csv.reader(f)
    for row in tweets:        
        data['airline_sentiment'].append(row[1])
        data['text'].append(row[10])
        corpus.append(row[10])
Example #21
def hsparser(Filecontent):
    if Filecontent:
        import wordsegment
        import re
        import urlextract
        ue = urlextract.URLExtract()
        #print(Filecontent)
        urls = ue.find_urls(Filecontent)
        for yu in urls:
            Filecontent = Filecontent.replace(yu, 'url')
        listofchar = [
            '_', '"', '\'', '“', '”', '’', '/', '\\', '-', ',', '`', '.', '>',
            '<', '?', '!', '#', '%', '&', '(', ')', '=', '+', '|', ';', ':',
            '~', ']', '[', '{', '}', '*', '^'
        ]
        for lis in listofchar:
            Filecontent = Filecontent.replace(lis, '')
        listofmoney = ['$', '£', '€']
        for lis in listofmoney:
            Filecontent = Filecontent.replace(lis, ' msign')
        s1 = Filecontent.split()
        brange = len(s1)
        b = 0
        while (b < brange):
            s1[b] = s1[b].lower()
            if (re.match(r"\S+@\S+", s1[b])):
                s1[b] = 'emailaddress'
            elif (re.match(r"http", s1[b])):
                s1[b] = 'url'
            elif (re.match(r"^msign[0-9]+$", s1[b])):
                #print("money")
                #print(s1[b])
                s1[b] = 'mamount'
            elif (re.match(r"^[0-9]+$", s1[b])):
                if (len(s1[b]) == 10):
                    #print("phone")
                    #print(s1[b])
                    s1[b] = 'pno'
                else:
                    s1.pop(b)
                    b = b - 1
                    brange = brange - 1
            elif (not re.match(r"^[0-9a-zA-Z]+$", s1[b])):
                #print("not english")
                #print(s1[b])
                s1.pop(b)
                b = b - 1
                brange = brange - 1
            elif (len(s1[b]) >= 20):
                #print("too long")
                #print(s1[b])
                s1[b] = 'emessage'
            b = b + 1
        result = ''
        i = 0
        i2 = 1
        #while i <= lent-1:
        # str = s1[i] + ' ' + s1[i+1]
        #try:
        #if wordsegment.BIGRAMS[str]>=1:
        #s1[i] = str
        #lent = lent - 1
        # s1.pop(i+1)
        #except Exception as inst:
        #  pass
        # i = i + 1
        #print(s1)
        #s1.extend(s2)
        return s1
    else:
        return []
Example #22
 def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
     super().__init__(component_config)
     self.extractor = urlextract.URLExtract()
Example #23
import urlextract

from ._html import get_html_hrefs

try:
    _url_extractor = urlextract.URLExtract(cache_dns=False)
except Exception:
    _url_extractor = urlextract.URLExtract()

MAX_URLS = 1000


def _url_extractor_base(content):
    try:
        return _url_extractor.gen_urls(content, check_dns=True)
    except Exception:
        return _url_extractor.gen_urls(content)


def _url_extractor_wrapper(content, url=None):
    extractor = _url_extractor_base(content)
    seen = set()
    for url in extractor:
        # package and cffi end up with repo``
        # https://github.com/lipoja/URLExtract/issues/13
        url = url.strip("`")

        if url.endswith(".html."):  # lit
            url = url[:-1]

        yield url
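A standalone sketch of the cleanup loop above; the sample text is invented and the DNS-check fallback is skipped.

import urlextract

extractor = urlextract.URLExtract()
content = "see `https://github.com/lipoja/URLExtract` and https://docs.python.org/3/index.html."
for url in extractor.gen_urls(content):
    url = url.strip("`")        # backticks can leak into results, per URLExtract issue #13
    if url.endswith(".html."):
        url = url[:-1]          # drop the trailing period left after .html
    print(url)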