def transform(self, X, y=None):
    X_transformed = []  # will contain the counts of words in emails
    for email in X:
        text = email_to_text(email) or ""
        if self.lower_case:
            text = text.lower()
        if self.replace_urls:
            url_extractor = urlextract.URLExtract()
            urls = list(set(url_extractor.find_urls(text)))
            urls.sort(key=lambda url: len(url), reverse=True)
            for url in urls:
                text = text.replace(url, " URL ")
        if self.replace_numbers:
            text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)
        if self.remove_punctuation:
            text = re.sub(r'\W+', ' ', text, flags=re.M)
        word_counts = Counter(text.split())
        if self.stemming:
            stemmer = nltk.PorterStemmer()
            stemmed_word_counts = Counter()
            for word, count in word_counts.items():
                stemmed_word = stemmer.stem(word)
                stemmed_word_counts[stemmed_word] += count
            word_counts = stemmed_word_counts
        X_transformed.append(word_counts)
    return np.array(X_transformed)
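# A minimal, hypothetical sketch of the same cleanup steps applied to one raw string.
# It is not the original estimator: the class around transform() and its email_to_text()
# helper are assumed to exist elsewhere, and the sample text and regex below are made up.
import re
from collections import Counter

import nltk
import urlextract

sample = "Visit https://example.com, offer ends in 3 days!"
extractor = urlextract.URLExtract()
for found_url in sorted(set(extractor.find_urls(sample)), key=len, reverse=True):
    sample = sample.replace(found_url, " URL ")  # longest URLs first, as above
sample = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', sample)  # collapse numbers
sample = re.sub(r'\W+', ' ', sample)  # strip punctuation
stemmer = nltk.PorterStemmer()
word_counts = Counter(stemmer.stem(word) for word in sample.lower().split())
print(word_counts)  # e.g. Counter({'visit': 1, 'url': 1, 'number': 1, ...})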
def _extract(text, regex=None):
    """Extract all the URIs from the given text."""
    text = text.replace('=\n', '')
    if regex is not None:
        return re.findall(regex, text)
    extractor = urlextract.URLExtract()
    extractor.extract_email = True
    return extractor.find_urls(text, only_unique=True)
def __init__(self, config: perlink.config.Configuration) -> None:
    super().__init__(">!")
    self.id = config.bot_id
    self._config = config
    self._db = perlink.db.Database(self)
    logger.info(f"Bot created: {self.id}")
    self._url_extractor = urlextract.URLExtract(
        extract_email=False,
        extract_localhost=False,
    )
def __init__(self):
    self.stopwords = stopwords.words('english')
    # self.ps = PorterStemmer()
    self.lm = WordNetLemmatizer()
    # stemmer will be used for each unique word once
    # self.stemmed = dict()
    self.lemmetized = dict()
    self.url_extractor = urlextract.URLExtract()
def __init__(self):
    self.url_extractor = urlextract.URLExtract()
    self.url_extractor.update()
    self.tag_regex = re.compile(r"<[^>]*>")
    self.email_regex = re.compile(r"[^\s]+@[^\s]+")
    self.number_regex = re.compile(r'\d+(?:\.\d*(?:[eE]\d+))?')
    self.spaces_regex = re.compile(r"\s+")
    self.special_chars = [
        "<", "[", "]", "`", "^", ">", "+", "?", "!", "'", ".", ",", ":",
        "*", "%", "#", "_", "=", "-", "&", '/', '\\', '(', ')', ";", "\"",
        "«", "»", "|", "•", "—", "–", "●", "►", "\n", "@", "$"
    ]
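# A hypothetical cleaning pass built on the patterns compiled in the __init__ above;
# the method name clean_text and the replacement order are assumptions, not part of
# the original class.
def clean_text(self, text):
    text = self.tag_regex.sub(" ", text)  # drop HTML tags
    text = self.email_regex.sub(" EMAIL ", text)  # mask email addresses
    for found_url in self.url_extractor.find_urls(text):
        text = text.replace(found_url, " URL ")  # mask URLs
    text = self.number_regex.sub(" NUMBER ", text)  # mask numbers
    for ch in self.special_chars:
        text = text.replace(ch, " ")  # remove listed special characters
    return self.spaces_regex.sub(" ", text).strip()  # collapse whitespace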
def __init__(self, includeSubject=True, stripNumbers=True, stripStopWords=True):
    """Initialiser

    Args:
        includeSubject (bool, optional): Include the subject as well as the email body. Defaults to True.
        stripNumbers (bool, optional): Strip numbers from text, replacing them with "NUMBER". Defaults to True.
        stripStopWords (bool, optional): Strip stop words from the text. Defaults to True.
    """
    self.url_extractor = urlextract.URLExtract()
    self.stemmer = nltk.PorterStemmer()
    self.includeSubject = includeSubject
    self.stripNumbers = stripNumbers
    self.stripStopWords = stripStopWords
def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
             replace_urls=True, replace_numbers=True, stemming=True):
    self.strip_headers = strip_headers
    self.lower_case = lower_case
    self.remove_punctuation = remove_punctuation
    self.replace_urls = replace_urls
    self.replace_numbers = replace_numbers
    self.stemming = stemming
    self.url_extractor = urlextract.URLExtract()
    self.stemmer = nltk.PorterStemmer()
def __init__(self):
    self.authors_dict = None
    self.publications_dict = None
    self.authors_pubs = dict()
    self.titles_list = []
    self.venues_dict = dict()
    self.years_dict = dict()
    self.auth_id_ind = dict()
    self.pub_id_ind = dict()
    self.__lang_predictor = langid.classify
    self.__lang_translator = Translator()
    self.__stop_words = stopwords.words('english')
    self.__url_extractor = urlextract.URLExtract()
    self.__tokenizer = RegexpTokenizer(r'\w+')
    self.__stemmer = PorterStemmer()
    nltk.download('stopwords')
    self.__attr_names = [
        "authors", "coAuthors", "publications", "titles", "venues", "years"
    ]
def transform(self, X, y=None):
    X_transformed = []
    for article in X:
        text = " ".join(article) if self.include_subj else " ".join(article[1:])
        if self.replace_html:
            text = self.__html_to_plain_text__(text)
        if self.replace_urls:
            url_extractor = urlextract.URLExtract()
            urls = list(set(url_extractor.find_urls(text)))
            urls.sort(key=lambda url: len(url), reverse=True)
            for url in urls:
                text = text.replace(url, " URL ")
        if self.replace_numbers:
            text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', ' NUMBER ', text)
        if self.remove_punctuation:
            # because we don't want these to be replaced by spaces
            text = text.replace("\'", "").replace("’", "")
            text = re.sub(r'\W+', ' ', text, flags=re.M)
        X_transformed.append(text)
    return X_transformed
def user(userID, per):
    # stopwords = stopwords.words('english')
    extractor = urlextract.URLExtract()
    translator = Translator()
    # stopwords = stopwords.words('english')
    # user_id=[]
    # with open('users.csv', 'rb') as fil:
    #     user = csv.reader(fil)
    #     for row in user:
    #         user_id.append(row[2])

    # intilization
    # total_posts=0
    # total_postsA=0
    message = []
    messageA = []
    total_posts_per_year = 0
    total_posts_per_yearA = 0
    shared = 0
    added = 0
    posted = 0
    updated = 0
    sharedA = 0
    addedA = 0
    postedA = 0
    updatedA = 0
    langu = []
    languA = []
    months = [0] * 12
    monthsA = [0] * 12
    daysX_monthsY = np.zeros([12, 31])
    daysX_monthsYA = np.zeros([12, 31])
    postLength = []
    average_nbr_words = 0
    urls_Size = 0
    average_nbr_links = 0
    average_nbr_posts_month = 0
    # sharedRatio=0
    # updateRatio=0
    # addRatio=0
    # postRatio=0
    # sharedRatioA=0
    # updateRatioA=0
    # addRatioA=0
    # postRatioA=0
    season = [0] * 4  # winter spring summer autumn
    seasonA = [0] * 4  # winter spring summer autumn
    hashTags = []  # store hashtags used
    number_hash = 0
    weekends = 0
    postsWeekends52 = 0
    postsWeekends51 = 0
    postsWeekends50 = 0
    weekendsA = 0
    postsWeekends52A = 0
    postsWeekends51A = 0
    postsWeekends50A = 0
    average_posts_weekEnd = 0
    average_posts_summer = 0
    average_posts_winter = 0
    average_posts_spring = 0
    average_posts_autumn = 0
    tophash = []
    tags = []
    tagnbr = []
    tagsA = []
    tagnbrA = []
    taggedposts = 0
    taggedpostsA = 0
    weekposts52 = 0
    weekposts51 = 0
    weekposts50 = 0
    weekposts52A = 0
    weekposts51A = 0
    weekposts50A = 0
    daysweek = np.zeros([3, 7]) * 4
    daysweekA = np.zeros([3, 7])
    activityM52 = 0
    activityN52 = 0
    activityM51 = 0
    activityN51 = 0
    activityM50 = 0
    activityN50 = 0
    activityM52A = 0
    activityN52A = 0
    activityM51A = 0
    activityN51A = 0
    activityM50A = 0
    activityN50A = 0
    url = []
    urlA = []
    urlSize = 0
    urlSizeA = 0
    user = 0
    userA = 0
    # shows whether they post M(1) or N(0) or 2 Both
    personalityActivityTime = [1, 1, 0, 1, 0, 1, 2, 2, 1, 1, 1, 1, 0, 0, 0, 2, 0, 1, 2, 1, 1, 0, 1]
    # shows whether they none 0 or add 1 share 2 update 3 post 4
    personalityTypePost = [2, 0, 2, 2, 0, 2, 0, 0, 4, 0, 2, 4, 2, 2, 4, 2, 2, 2, 2, 0, 2, 0, 0]
    # shows how many times they post none 0 or hour 1 couples of day 2 once per day 3 rarely 4
    personalityDay = [4, 4, 3, 4, 2, 4, 1, 4, 2, 4, 4, 4, 4, 4, 2, 4, 3, 4, 4, 4, 4, 4]
    path = 'user_posts_' + userID + '.csv'
    pathA = 'user_posts_' + userID + '.csv'

    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    def lima(word, words):
        lemmatiser = WordNetLemmatizer()
        words_tag = dict(pos_tag(words))
        return lemmatiser.lemmatize(word, get_wordnet_pos(words_tag.get(word)))

    def clean(words):
        # words = re.sub('[^a-zA-Z]', '', words.lower()).split()
        tknzr = TweetTokenizer()
        # tokenizer = RegexpTokenizer('\w+|\S+')
        # words=nltk.word_tokenize(words.lower())
        words = tknzr.tokenize(words)
        exclude = set(string.punctuation)
        words2 = [word for word in words if not word in exclude]
        words_tag = dict(pos_tag(words))
        words = [word.lower() for word in words2
                 if not word in nltk.corpus.stopwords.words('english') and not word.isdigit()]
        # print(words)
        words = [lima(word, words) for word in words]
        # print(words)
        words = ' '.join(words)
        # print(words)
        return words

    def display_topics(model, feature_names, no_top_words):
        for topic_idx, topic in enumerate(model.components_):
            l = "Topic %d:" % (topic_idx)
            l = " ".join([feature_names[i].encode("utf-8")
                          for i in topic.argsort()[:-no_top_words - 1:-1]])
            tophash.append(feature_names[i].encode("utf-8"))

    def topic_hash(hashtags):
        vectorizer = TfidfVectorizer(min_df=0.2, stop_words='english')
        X = vectorizer.fit_transform(hashtags)
        no_topics = min(10, len(hashtags))
        nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5,
                  init='nndsvd').fit(X)
        display_topics(nmf, vectorizer.get_feature_names(), 1)

    def extract_hash(word):
        i = 0
        h = ''
        while (i < len(word)):
            s = word[i]
            if (s == '#'):
                i += 1
                while (i < len(word) and not (word[i] == '#' or word[i] == ' ' or word[i] == "\n")):
                    s = word[i]
                    h += s
                    i += 1
                hashTags.append(h)
                h = ''
            else:
                i += 1

    def get_season(now):
        if isinstance(now, datetime):
            now = now.date()
        return next(s for s, (start, end) in seasons if start <= now <= end)

    def get_tags(tag):
        tag = tag.replace('u', '')
        tag = tag.replace('[', '')
        tag = tag.replace(']', '')
        tag = tag.replace('\'', '')
        tag = (tag.split(','))
        tagnbr.append(len(tag))
        i = 0
        while (i < len(tag)):
            if (int(tag[i]) not in tags):
                tags.append(int(tag[i]))
            i += 1
        return tags

    def get_tags_anoamly(tagA):
        tagL = tagA.replace('u', '')
        tagL = tagL.replace('[', '')
        tagL = tagL.replace(']', '')
        tagL = tagL.replace('\'', '')
        tagL = (tagL.split(','))
        tagnbrA.append(len(tagL))
        l = 0
        while (l < len(tagL)):
            if (int(tagL[l]) not in tagsA):
                tagsA.append(int(tagL[l]))
            l += 1
        return tagsA

    with open(path, 'rb') as f:
        posts = csv.reader(f)
        for items in posts:
            # check date time
            datetime_object = datetime.strptime(items[3], "%Y-%m-%d %H:%M:%S")
            hour = datetime_object.hour
            month = datetime_object.month
            year = datetime_object.year
            day = datetime_object.day
            dates = datetime.date(datetime_object)
            seasons = [('winter', (date(year, 1, 1), date(year, 3, 20))),
                       ('spring', (date(year, 3, 21), date(year, 6, 20))),
                       ('summer', (date(year, 6, 21), date(year, 9, 22))),
                       ('autumn', (date(year, 9, 23), date(year, 12, 20))),
                       ('winter', (date(year, 12, 21), date(year, 12, 31)))]
            # activity in 2017
            if (year == 2017):
                total_posts_per_year += 1
                # posts/month
                months[month - 1] += 1
                # posts/day
                daysX_monthsY[month - 1][day - 1] += 1
                # week number
                weekNumber = dates.isocalendar()[1]
                # activity of last 3 weeks in 2017
                # total nbr of posts/each week
                # posts/day in each week
                # activity time in each day in each week
                if (weekNumber == 51):
                    weekend = datetime_object.weekday()
                    if (weekend == 4 or weekend == 5):
                        # weekends+=1
                        postsWeekends52 += 1
                    weekposts52 += 1
                    daysweek[0][(dates.weekday()) - 1] += 1
                    if (hour >= 6 and hour < 18):
                        activityM52 += 1
                    elif (hour >= 18 and hour < 24):
                        activityN52 += 1
                    elif (hour >= 0 and hour < 6):
                        activityN52 += 1
                if (weekNumber == 50):
                    weekend = datetime_object.weekday()
                    if (weekend == 4 or weekend == 5):
                        # weekends+=1
                        postsWeekends51 += 1
                    weekposts51 += 1
                    if (hour >= 6 and hour < 18):
                        activityM51 += 1
                    elif (hour >= 18 and hour < 24):
                        activityN51 += 1
                    elif (hour >= 0 and hour < 6):
                        activityN51 += 1
                    daysweek[1][(dates.weekday()) - 1] += 1
                if (weekNumber == 49):
                    weekend = datetime_object.weekday()
                    if (weekend == 4 or weekend == 5):
                        # weekends+=1
                        postsWeekends50 += 1
                    weekposts50 += 1
                    if (hour >= 6 and hour < 18):
                        activityM50 += 1
                    elif (hour >= 18 and hour < 24):
                        activityN50 += 1
                    elif (hour >= 0 and hour < 6):
                        activityN50 += 1
                    daysweek[2][(dates.weekday()) - 1] += 1
                # which type they use most share/add/update/post
                if (items[4] == 'added'):
                    added += 1
                elif (items[4] == 'updated'):
                    updated += 1
                elif (items[4] == 'posted'):
                    posted += 1
                else:
                    shared += 1
                # season
                if (get_season(datetime_object) == 'winter'):
                    season[0] += 1
                elif (get_season(datetime_object) == 'spring'):
                    season[1] += 1
                elif (get_season(datetime_object) == 'summer'):
                    season[2] += 1
                elif (get_season(datetime_object) == 'autumn'):
                    season[3] += 1
                weekends = len([1 for i in calendar.monthcalendar(2017, 12) if i[5] != 0])
                weekends += len([1 for i in calendar.monthcalendar(2017, 12) if i[4] != 0])
                # lang detector
                if (items[0]):
                    t = translator.detect(json.dumps(items[0].decode('utf-8')))
                    langu.append(t.lang)
                    # msg=items[0]
                    # if(t.lang=="en"):
                    #     message.append(clean(msg).encode('utf-8'))
                # tags
                k = items[5]
                if (len(k) > 2):
                    taggedposts += 1
                    get_tags(k)
                urls = extractor.find_urls(items[0])
                url.append(urls)
                urlSize += len(urls)
                # nbr of words
                # word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
                # mess=items[0]
                # mess = unicode(mess, errors='ignore')
                # words1 = word_tokenizer.tokenize(mess.lower())
                # words=[word for word in words1 if
                #        not word in stopwords]
                # postLength.append(len(words))
                # nbr of urls
                # extract hashTags
                extract_hash(items[0])
                # count hash tags
                number_hash += items[0].count('#')

    # anomaly user
    with open(pathA, 'rb') as files:
        postsA = csv.reader(files)
        for anomaly in postsA:
            # check date time
            datetime_object = datetime.strptime(anomaly[3], "%Y-%m-%d %H:%M:%S")
            hour = datetime_object.hour
            month = datetime_object.month
            year = datetime_object.year
            day = datetime_object.day
            dates = datetime.date(datetime_object)
            seasons = [('winter', (date(year, 1, 1), date(year, 3, 20))),
                       ('spring', (date(year, 3, 21), date(year, 6, 20))),
                       ('summer', (date(year, 6, 21), date(year, 9, 22))),
                       ('autumn', (date(year, 9, 23), date(year, 12, 20))),
                       ('winter', (date(year, 12, 21), date(year, 12, 31)))]
            # activity in 2017
            if (year == 2017):
                total_posts_per_yearA += 1
                # posts/month
                monthsA[month - 1] += 1
                # posts/day
                daysX_monthsYA[month - 1][day - 1] += 1
                # week number
                weekNumber = dates.isocalendar()[1]
                # activity of last 3 weeks in 2017
                # total nbr of posts/each week
                # posts/day in each week
                # activity time in each day in each week
                if (weekNumber == 51):
                    weekend = datetime_object.weekday()
                    if (weekend == 4 or weekend == 5):
                        # weekendsA+=1
                        postsWeekends52A += 1
                    weekposts52A += 1
                    daysweekA[0][(dates.weekday()) - 1] += 1
                    if (hour >= 6 and hour < 18):
                        activityM52A += 1
                    elif (hour >= 18 and hour < 24):
                        activityN52A += 1
                    elif (hour >= 0 and hour < 6):
                        activityN52A += 1
                if (weekNumber == 50):
                    weekend = datetime_object.weekday()
                    if (weekend == 4 or weekend == 5):
                        # weekendsA+=1
                        postsWeekends51A += 1
                    weekposts51A += 1
                    if (hour >= 6 and hour < 18):
                        activityM51A += 1
                    elif (hour >= 18 and hour < 24):
                        activityN51A += 1
                    elif (hour >= 0 and hour < 6):
                        activityN51A += 1
                    daysweekA[1][(dates.weekday()) - 1] += 1
                if (weekNumber == 49):
                    weekend = datetime_object.weekday()
                    if (weekend == 4 or weekend == 5):
                        # weekendsA+=1
                        postsWeekends50A += 1
                    weekposts50A += 1
                    if (hour >= 6 and hour < 18):
                        activityM50A += 1
                    elif (hour >= 18 and hour < 24):
                        activityN50A += 1
                    elif (hour >= 0 and hour < 6):
                        activityN50A += 1
                    daysweekA[2][(dates.weekday()) - 1] += 1
                # which type they use most share/add/update/post
                if (anomaly[4] == 'added'):
                    addedA += 1
                elif (anomaly[4] == 'updated'):
                    updatedA += 1
                elif (anomaly[4] == 'posted'):
                    postedA += 1
                else:
                    sharedA += 1
                # season
                if (get_season(datetime_object) == 'winter'):
                    seasonA[0] += 1
                elif (get_season(datetime_object) == 'spring'):
                    seasonA[1] += 1
                elif (get_season(datetime_object) == 'summer'):
                    seasonA[2] += 1
                elif (get_season(datetime_object) == 'autumn'):
                    seasonA[3] += 1
                weekendsA = len([1 for i in calendar.monthcalendar(2017, 12) if i[5] != 0])
                weekendsA += len([1 for i in calendar.monthcalendar(2017, 12) if i[4] != 0])
                # tags
                kA = anomaly[5]
                if (len(kA) > 2):
                    taggedpostsA += 1
                    get_tags_anoamly(kA)
                # lang detector
                tA = translator.detect(json.dumps(anomaly[0].decode('utf-8')))
                languA.append(tA.lang)
                # msgA=anomaly[0]
                # if(tA.lang=='en'):
                #     messageA.append(clean(msgA).encode('utf-8'))
                urlsA = extractor.find_urls(items[0])
                urlA.append(urlsA)
                urlSizeA += len(urlsA)
                # # nbr of words
                # word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
                # mess=anomaly[0]
                # mess = unicode(mess, errors='ignore')
                # words1 = word_tokenizer.tokenize(mess.lower())
                # words=[word for word in words1 if
                #        not word in stopwords]
                # postLength.append(len(words))
                # nbr of urls
                urls = extractor.find_urls(anomaly[0])
                urls_Size += len(urls)
                # extract hashTags
                extract_hash(anomaly[0])
                # count hash tags
                number_hash += anomaly[0].count('#')

    scoreA1 = 2
    scoreB1 = 2
    import topicModel
    scoreTA, scoreTB = topicModel.topic(userID, userID, scoreA1, scoreB1)
    import lang
    scoreLA, scoreLB = lang.language(langu, languA, total_posts_per_year,
                                     total_posts_per_yearA, scoreTA, scoreTB)
    # print added,updated,posted,shared
    # print addedA,updatedA,postedA,sharedA
    import typeOfPost
    scorePA, scorePB = typeOfPost.type_post(added, shared, updated, posted,
                                            total_posts_per_year, addedA, sharedA,
                                            updatedA, postedA, total_posts_per_yearA,
                                            personalityTypePost[per], scoreLA, scoreLB)
    import tagged
    scoreTagA, scoreTagB = tagged.tagged(tags, tagnbr, taggedposts, total_posts_per_year,
                                         tagsA, tagnbrA, taggedpostsA,
                                         total_posts_per_yearA, scorePA, scorePB)
    import freq
    scoreFA, scoreFB = freq.frequency(postsWeekends52, postsWeekends51, postsWeekends50,
                                      weekends, postsWeekends52A, postsWeekends51A,
                                      postsWeekends50A, weekendsA, activityM52, activityM51,
                                      activityM50, activityN52, activityN51, activityN50,
                                      activityM52A, activityM51A, activityM50A, activityN52A,
                                      activityN51A, activityN50A, personalityActivityTime[per],
                                      daysweek, daysweekA, season, total_posts_per_year,
                                      seasonA, total_posts_per_yearA, weekposts52, weekposts51,
                                      weekposts50, weekposts52A, weekposts51A, weekposts50A,
                                      personalityDay[per], scoreTagA, scoreTagB)
    # import links
    # links.link(url, urlSize, total_posts_per_year, urlA, urlSizeA, total_posts_per_yearA)
    return scoreFA, scoreFB
import concurrent.futures
import logging
import queue
import string
import threading
import time
from typing import Dict, List, NoReturn, Optional, Tuple

import ircstyle
import miniirc
import urlextract

from . import config
from .title import url_title_reader
from .util.urllib import validate_parsed_url

PUNCTUATION = tuple(string.punctuation)
log = logging.getLogger(__name__)
url_extractor = urlextract.URLExtract()  # pylint: disable=invalid-name


def _alert(irc: miniirc.IRC, msg: str, loglevel: int = logging.ERROR) -> None:
    log.log(loglevel, msg)
    irc.msg(config.INSTANCE["alerts_channel"], msg)


class Bot:
    """Bot."""

    EXECUTORS: Dict[str, concurrent.futures.ThreadPoolExecutor] = {}
    QUEUES: Dict[str, queue.SimpleQueue] = {}

    def __init__(self) -> None:
        log.info("Initializing bot as: %s",
def parallel_file_extraction_worker(file):
    extractor = urlextract.URLExtract()
    with open(file) as f:
        return list(map(lambda x: (file, x), extractor.find_urls(f.read())))
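# A hypothetical driver that fans the worker above out over a directory of files;
# the multiprocessing setup and the glob pattern are assumptions, not part of the
# original snippet.
import glob
from multiprocessing import Pool

if __name__ == "__main__":
    files = glob.glob("corpus/*.txt")
    with Pool() as pool:
        # each worker result is a list of (file, url) tuples
        results = pool.map(parallel_file_extraction_worker, files)
    pairs = [pair for result in results for pair in result]
    print(len(pairs), "URLs found")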
def check_for_emails():
    threading.Timer(10, check_for_emails).start()
    mail = imaplib.IMAP4_SSL("imap.gmail.com")
    mail.login("*****@*****.**", open("password.txt", "r").read().strip())
    for folder in ["\"[Gmail]/Kaikki viestit\"", "\"[Gmail]/Roskaposti\""]:
        res = mail.select(folder)
        status, data = mail.search(None, "UnSeen")
        mail_ids = []
        for block in data:
            mail_ids += block.split()
        if len(mail_ids) > 0:
            print(str(len(mail_ids)) + " new emails")
        for i in mail_ids:
            status, data = mail.fetch(i, "(RFC822)")
            for response_part in data:
                if isinstance(response_part, tuple):
                    message = email.message_from_bytes(response_part[1])
                    mail_from_original = message["from"]
                    mail_subject = message["subject"]
                    mail_time = dparser.parse(
                        message["received"].split("\n")[1].strip(),
                        fuzzy=True).astimezone().strftime("%d/%m/%y %H %p")
                    sender_search = re.search(r"[\w\.-]+@[\w\.-]+\.\w+", mail_from_original)
                    if sender_search:
                        mail_from_original = sender_search.group(0)
                    if message.is_multipart():
                        mail_content = ""
                        for part in message.get_payload():
                            if part.get_content_type() in [
                                    "text/plain", "text/html", "multipart/alternative"
                            ]:
                                if type(part.get_payload()) == list:
                                    for m in part.get_payload():
                                        tmp = m.as_string()
                                        tmp = tmp.split("\n\n", 1)[1]
                                        try:
                                            tmp = base64.b64decode(tmp).decode("utf-8")
                                        except:
                                            pass
                                        mail_content += tmp
                                        break
                                else:
                                    mail_content += part.get_payload()
                    else:
                        mail_content = message.get_payload()
                        try:
                            mail_content = base64.b64decode(mail_content).decode("utf-8")
                        except:
                            pass
                    mail_from = mail_from_original
                    if "Subject: " in mail_content or "From: " in mail_content:
                        for l in mail_content.split("\n"):
                            if l.startswith("Subject: "):
                                mail_subject = l[len("Subject: "):]
                            if l.startswith("From: "):
                                mail_from = l[len("From: "):]
                            if l.startswith("Von: "):
                                mail_from = l[len("Von: "):]
                    mail_subject = mail_subject.lower()
                    while mail_subject.startswith("fw: ") or mail_subject.startswith("fwd: "):
                        if mail_subject.startswith("fw: "):
                            mail_subject = mail_subject[4:]
                        if mail_subject.startswith("fwd: "):
                            mail_subject = mail_subject[5:]
                    sender_search = re.search(r"[\w\.-]+@[\w\.-]+\.\w+", mail_from)
                    if sender_search:
                        mail_from = sender_search.group(0)
                    links = urlextract.URLExtract().find_urls(mail_content)
                    for i in range(len(links)):
                        links[i] = urlparse(links[i]).hostname
                    links = list(set(links))
                    for i in range(len(links) - 1, 0, -1):
                        if links[i] is None or len(links[i]) == 0:
                            del links[i]
                    mailObject = {
                        "reporter": mail_from_original,
                        "subject": mail_subject,
                        "links": links,
                        "time": mail_time
                    }
                    add_message(mail_from, mailObject)
def extract_urls(
    text: str,
    log: logging.Logger,
    examples: bool = False,
    common: bool = False,
) -> set:
    """
    Return a list of URLs in a text string.

    @param text      The text to extract URLs from
    @param log       The log
    @param examples  Include example URLs
    @param common    Include URLs that are common in IETF documents

    @return List of URLs.
    """
    # find all URLs
    extractor = urlextract.URLExtract()
    extractor.update_when_older(7)  # update TLDs when older than 7 days
    text = unfold(text)
    urls = []
    for url in extractor.gen_urls(text):
        url = url.rstrip(".\"]'>;,")
        # if not re.search(r"://", url):
        #     url = "http://" + url
        if re.match(r"[\d\.:a-f]+", url, flags=re.IGNORECASE):
            # skip literal IP addresses
            continue
        try:
            urllib.parse.urlparse(url).netloc
        except ValueError as err:
            log.warning("%s: %s", err, url)
            continue
        urls.append(url)

    if not examples:
        # remove example URLs
        urls = [
            u for u in urls
            if not re.search(
                r"example\.(com|net|org)|\.example",
                urllib.parse.urlparse(u).netloc if urllib.parse.urlparse(u).netloc else u,
                re.IGNORECASE,
            )
        ]

    if not common:
        # remove some common URLs
        urls = [
            u for u in urls
            if not re.search(
                r"""https?://
                    datatracker\.ietf\.org/drafts/current/|
                    trustee\.ietf\.org/license-info|
                    (www\.)?rfc-editor\.org/info/rfc\d+|
                    (www\.)?ietf\.org/archive/id/draft-""",
                u,
                flags=re.VERBOSE | re.IGNORECASE,
            )
        ]

    return set(urls)
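# A hypothetical call site for extract_urls above; it assumes the unfold() helper used
# inside the function is defined elsewhere, and the sample text is made up.
import logging

logging.basicConfig(level=logging.WARNING)
sample = "See https://www.iana.org/domains and https://example.com/demo for details."
found = extract_urls(sample, logging.getLogger("urlcheck"))
print(found)  # example.com/demo is dropped by the example-URL filter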
def __init__(self, client):
    self.client = client
    self.extractor = urlextract.URLExtract()
    self.command_handler = CommandHandler.CommandHandler(client)
def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except:
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        else:
            html = content
    if html:
        return html_to_plain_text(html)


urlextract = urlextract.URLExtract()
stemmer = nltk.PorterStemmer()


class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_header=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_number=True, stemming=True):
        self.strip_header = strip_header
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
try:
    import nltk  # Natural Language Toolkit

    stemmer = nltk.PorterStemmer()  # suffix stripper
    for word in ("Computations", "Computation", "Computing", "Computed",
                 "Compute", "Compulsive"):
        print(word, "=>", stemmer.stem(word))  # output: comput. It's a suffix stripping function.
except ImportError:
    print("Error: stemming requires the NLTK module.")
    stemmer = None

try:
    import urlextract  # may require an Internet connection to download root domain names

    url_extractor = urlextract.URLExtract()  # a tool which can find urls in a sentence.
    print(url_extractor.find_urls(
        "Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"))
except ImportError:
    print("Error: replacing URLs requires the urlextract module.")
    url_extractor = None

print('-----preparation done-----')

print('make classifier')
X_few = X_train[:3]  # train sample
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)  # count emails' words
def urlExtract(text):
    extr = urlextract.URLExtract()
    urls = extr.find_urls(text=text)
    return urls
def __init__(self, bot: commands.Bot):
    self.bot = bot
    self.urlextractor = urlextract.URLExtract()
import csv
import string

import uritools
import urlextract
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize

extractor = urlextract.URLExtract()
ps = PorterStemmer()

data = {'airline_sentiment': [], 'text': []}
airline_sentiment = []
corpus = []

with open('Tweets.csv', 'r', encoding='utf8') as f:
    tweets = csv.reader(f)
    for row in tweets:
        data['airline_sentiment'].append(row[1])
        data['text'].append(row[10])
        corpus.append(row[10])
def hsparser(Filecontent):
    if Filecontent:
        import wordsegment
        import re
        import urlextract
        ue = urlextract.URLExtract()
        # print(Filecontent)
        urls = ue.find_urls(Filecontent)
        for yu in urls:
            Filecontent = Filecontent.replace(yu, 'url')
        listofchar = [
            '_', '"', '\'', '“', '”', '’', '/', '\\', '-', ',', '`', '.', '>',
            '<', '?', '!', '#', '%', '&', '(', ')', '=', '+', '|', ';', ':',
            '~', ']', '[', '{', '}', '*', '^'
        ]
        for lis in listofchar:
            Filecontent = Filecontent.replace(lis, '')
        listofmoney = ['$', '£', '€']
        for lis in listofmoney:
            Filecontent = Filecontent.replace(lis, ' msign')
        s1 = Filecontent.split()
        brange = len(s1)
        b = 0
        while (b < brange):
            s1[b] = s1[b].lower()
            if (re.match(r"\S+@\S+", s1[b])):
                s1[b] = 'emailaddress'
            elif (re.match(r"http", s1[b])):
                s1[b] = 'url'
            elif (re.match(r"^msign[0-9]+$", s1[b])):
                # print("money")
                # print(s1[b])
                s1[b] = 'mamount'
            elif (re.match(r"^[0-9]+$", s1[b])):
                if (len(s1[b]) == 10):
                    # print("phone")
                    # print(s1[b])
                    s1[b] = 'pno'
                else:
                    s1.pop(b)
                    b = b - 1
                    brange = brange - 1
            elif (not re.match(r"^[0-9a-zA-Z]+$", s1[b])):
                # print("not english")
                # print(s1[b])
                s1.pop(b)
                b = b - 1
                brange = brange - 1
            elif (len(s1[b]) >= 20):
                # print("too long")
                # print(s1[b])
                s1[b] = 'emessage'
            b = b + 1
        result = ''
        i = 0
        i2 = 1
        # while i <= lent-1:
        #     str = s1[i] + ' ' + s1[i+1]
        #     try:
        #         if wordsegment.BIGRAMS[str]>=1:
        #             s1[i] = str
        #             lent = lent - 1
        #             s1.pop(i+1)
        #     except Exception as inst:
        #         pass
        #     i = i + 1
        # print(s1)
        # s1.extend(s2)
        return s1
    else:
        return []
def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
    super().__init__(component_config)
    self.extractor = urlextract.URLExtract()
import urlextract

from ._html import get_html_hrefs

try:
    _url_extractor = urlextract.URLExtract(cache_dns=False)
except Exception:
    _url_extractor = urlextract.URLExtract()

MAX_URLS = 1000


def _url_extractor_base(content):
    try:
        return _url_extractor.gen_urls(content, check_dns=True)
    except Exception:
        return _url_extractor.gen_urls(content)


def _url_extractor_wrapper(content, url=None):
    extractor = _url_extractor_base(content)
    seen = set()
    for url in extractor:
        # package and cffi end up with repo``
        # https://github.com/lipoja/URLExtract/issues/13
        url = url.strip("`")
        if url.endswith(".html."):  # lit
            url = url[:-1]
        yield url
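# A small, hypothetical usage sketch for the generator wrapper above; the sample text
# is made up and only the calls shown in the snippet are relied on.
sample = "Docs live at `https://docs.python.org/3/` and at http://example.org/page.html."
for found_url in _url_extractor_wrapper(sample):
    print(found_url)  # backticks around URLs are stripped per the workaround above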