def test_stemmer(self):
    """A long token is stemmed into progressively shorter stems (down to length 3);
    a token that is already short is returned unchanged."""
    line = "мамочка свари суп"
    # Long word at chars 0-7: the stemmer yields the token plus shortened variants.
    actual_long = list(Stemmer().stem(Token(0, 7, line, 'a'), 4, line))
    expected_long = [
        Token(0, 7, line, 'a'),
        Token(0, 6, line, 'a'),
        Token(0, 5, line, 'a'),
        Token(0, 4, line, 'a'),
        Token(0, 3, line, "a"),
    ]
    # Short word at chars 14-17: nothing to strip, so only the original comes back.
    actual_short = list(Stemmer().stem(Token(14, 17, line, "a"), 4, line))
    expected_short = [Token(14, 17, line, "a")]
    self.assertEqual(actual_long, expected_long)
    self.assertEqual(actual_short, expected_short)
def __init__(self, config=None):
    """Initialize parser state: stop-word lists, lookup dicts and an optional stemmer.

    :param config: optional configuration object; when provided, its ``toStem``
        attribute decides whether a ``Stemmer`` is attached.
    """
    self.tmp_for_entites = {}
    # nltk English stop words plus punctuation / twitter-noise tokens to drop.
    self.stop_words = stopwords.words('english') + [
        '?', '!', ',', '+', '-', '*', '/', '"', '.', '<', '>', '=', ':',
        '', '{', '{}', '}', '[', ']', '[]', 'are', 'and', 'an', 'at',
        'am', 'a', 'even', 'every', 'everyone', 'rt', 'RT'
    ]
    self.global_dict = {}  # value = number of docs
    self.post_dict = {
    }  # key="word", value=[parquet name, index in parquet, tweet id, frequency in tweet, location in tweet, tf]
    self.entities = {}
    self.path_stop_words = [
        'RT', "rt", 'tweet', 'www', 'http', 'https', 'WWW'
    ]
    self.corona_list = [
        "cov", 'corona', 'coronavirus', 'covid', 'covid19', 'covid 19',
        'corona virus', 'virus corona', 'corona_virus', 'virus_corona',
        "virus"
    ]
    self.config = config
    self.trump = [
        "donald", "donald trump", "trump donald", "president",
        "trump_donald", "donald_trump", "trump-donald", "donald-trump"
    ]
    self.stemmer = None
    # BUG FIX: config defaults to None, so guard before reading config.toStem
    # (the original raised AttributeError whenever no config was supplied).
    if self.config is not None and self.config.toStem:
        self.stemmer = Stemmer()
def __init__(self, docs_dir, docs_size):
    """Wire up the pipeline collaborators, then reset and prepare internal storage.

    :param docs_dir: directory the DocLoader reads documents from.
    :param docs_size: number of documents to load / size the setup for.
    """
    # Collaborators: loading, tokenization, stemming and the term dictionary.
    self.docLoader = DocLoader(docs_dir, docs_size)
    self.tokenizer = Tokenizer()
    self.stemmer = Stemmer()
    self.dictionary = Dictionary(load=False)
    # Start from a clean slate, then size structures for the corpus.
    self._clean()
    self._setup(docs_size)
def __init__(self, stem):
    """Parser setup: stop-word table, term dictionary and optional stemmer.

    :param stem: truthy to enable stemming.
    """
    self.stop_words = stopwords.words('english')
    # Extend nltk's list with extra words plus twitter/encoding noise tokens.
    self.stop_words.extend([
        'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there',
        'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they',
        'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such',
        'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am',
        'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves',
        'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through',
        'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this',
        'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up',
        'to', 'ours', 'had', 'she', 'all', 'when', 'at', 'any', 'before',
        'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does',
        'yourselves', 'then', 'that', 'because', 'what', 'over', 'why',
        'so', 'can', 'did', 'now', 'under', 'he', 'you', 'herself', 'has',
        'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i',
        'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my',
        'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was',
        'here', 'than', 'rt', "don't", '-', '&', 'it’s', 'don’t', 'i’m',
        "it's", "doesn't", 'https', 't.co', 'twitter.com', 'weve', 'ur',
        'due', 'damn', 'us', 'theyre', 'would', 'might'
    ])
    # IDIOM FIX: dict.fromkeys replaces the index-based comprehension
    # `{self.stop_words[i]: 0 for i in range(...)}` — same mapping, clearer.
    self.stop_words_dict = dict.fromkeys(self.stop_words, 0)
    self.term_dict = {}
    self.toStem = stem
    self.text_tokens = []
    # ROBUSTNESS FIX: always define self.stemmer so attribute access cannot
    # raise AttributeError when stemming is disabled.
    self.stemmer = Stemmer() if self.toStem else None
def __init__(self, stemming):
    """Tokenizer setup: stop-word table, optional stemmer and compiled patterns.

    :param stemming: truthy to attach a Stemmer; otherwise self.stemmer is None.
    """
    self.stop_words = stopwords.words('english')
    # BUG FIX: the original had `'}' "'&'"` (missing comma), which silently
    # concatenated into the single bogus stop word "}'&'"; '}' and "'&'" are
    # now separate entries.
    self.stop_words.extend(
        ['rt', '“', r'’', r'n\'t', 'n\'t', '\'s', r'\'s', r'\'ve', r'\'m',
         '...', r'\'\'', r'\'d', '&', r'\'ll', r'\'re', r' ', r'', r"", r"''",
         r'""', r'"', r"“", "”", r"’", "‘", r"``", '``', r"'", r"`",
         r'!', r'?', r',', r':', r';', r'(', r')', r'...', r'[', ']', r'{',
         '}', "'&'", '.', r'\'d', '-', '--'])
    self.stop_words_dict = dict.fromkeys(self.stop_words)
    self.text_tokens = None
    self.stemmer = None
    if stemming:
        self.stemmer = Stemmer()
    # Splits hashtag bodies on camelCase / digit boundaries.
    self.hashtag_split_pattern = re.compile(r'[a-zA-Z0-9](?:[a-z0-9]+|[A-Z0-9]*(?=[A-Z]|$))')
    self.take_off_non_latin = re.compile(
        pattern=r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF\u2019]')
    # fraction-like tokens: "a/b" and "a\b" with a non-zero denominator
    self.left_slash_pattern = re.compile(r'^-?[0-9]+/0*[1-9][0-9]*$')
    self.right_slash_pattern = re.compile(r'^-?[0-9]+\\0*[1-9][0-9]*$')
    # NOTE(review): "wednsday" is misspelled; left as-is since downstream code
    # may match on this exact string — confirm before correcting.
    self.days_dict = {"Sat": "saturday", "Sun": "sunday", "Mon": "monday",
                      "Tue": "tuesday", "Wed": "wednsday", "Thu": "thursday",
                      "Fri": "friday"}
    self.months_dict = {"Jul": ("july", "07"), "Aug": ("august", "08")}
    self.kbm_shorts = {"k": None, "m": None, "b": None, "K": None, "M": None, "B": None}
def test_stemmer_flex(self):
    """stem_flex on an 8-char token yields the token itself plus one shortened stem."""
    line = "мамочка свари суп"
    produced = list(Stemmer().stem_flex(Token(0, 8, "мамочка свари суп", "a")))
    expected = [
        Token(0, 8, line, 'a'),
        Token(0, 7, line, 'a'),
    ]
    self.assertEqual(produced, expected)
def __init__(self, stemming=False):
    """Parser setup: stemmer, number-suffix map, first-letter stop-word buckets
    and separator/symbol sets.

    :param stemming: flag recorded on the instance (see note on ``toStem``).
    """
    self.stemming = stemming
    # NOTE(review): despite the flag-like name, toStem holds a Stemmer instance
    # unconditionally — confirm callers rely on that before changing it.
    self.toStem = Stemmer()
    self.terms_dic_to_document = {}
    # word → shorthand used when normalizing quantities
    self.numberList = {
        "thousand": 'K',
        "million": 'M',
        "billion": 'B',
        "percentage": '%',
        "percent": '%',
        "dollar": '$'
    }
    self.stop_words = stopwords.words('english')
    # Bucket the stop words by first letter for quick lookup.
    # IDIOM FIX: a comprehension replaces the hand-written 26-entry literal.
    self.dict_stop_words = {c: [] for c in 'abcdefghijklmnopqrstuvwxyz'}
    for w in self.stop_words:
        self.dict_stop_words[w[0]].append(w)
    # operators, parentheses and separator characters to skip
    self.skip_list = {
        ',', ';', ':', ' ', '\n', '(', ')', '[', ']', '{', '}', '*', '+',
        '-', '/', '<', '>', '&', '=', '|', '~', '"'
    }
    # miscellaneous symbols ('}' appeared twice in the original literal; a set dedupes)
    self.wird_symbols = {
        '!', '#', '$', '%', '&', '(', ')', ',', '*', '+', '-', '.', '/',
        ':', ';', '<', '=', '>', '?', '@', '[', "'\'", ']', '^', '`', '{',
        '|', '}', '~', '}'
    }
def add_new_doc(self, document, documents_list_length=10000):
    """
    This function perform indexing process for a document object.
    Saved information is captures via two dictionaries ('inverted index' and 'posting')
    :param document: a document need to be indexed.
    :return: -
    """
    try:
        document_dictionary = document.term_doc_dictionary
        # PERF FIX: build the stemmer once instead of once per term.
        stemmer = Stemmer() if self.stemming == 'y' else None
        for term in document_dictionary.keys():
            if stemmer is not None:
                term = stemmer.stem_term(term)
            # Update inverted index and posting
            if term not in self.inverted_idx:
                # [amount of docs, [(freq in doc, doc id), ...]]
                self.inverted_idx[term] = [
                    1, [(document_dictionary[term], document.tweet_id)]
                ]
            else:
                self.inverted_idx[term][0] += 1  # amount of docs
                self.inverted_idx[term][1].append(
                    (document_dictionary[term], document.tweet_id))
            if term not in self.postingDict:
                self.postingDict[term] = [(document.tweet_id,
                                           document_dictionary[term])]
            else:
                self.postingDict[term].append(
                    (document.tweet_id, document_dictionary[term]))
            # tweet_dict entry: [[most frequent term, its freq],
            #                    amount of unique terms, amount of terms seen]
            if document.tweet_id not in self.tweet_dict:
                self.tweet_dict[document.tweet_id] = [
                    [term, document_dictionary[term]], 1, 0
                ]
            elif document_dictionary[term] > self.tweet_dict[
                    document.tweet_id][0][1]:
                # tweet exists and this term beats the current max frequency
                if self.tweet_dict[document.tweet_id][0][1] == 1:
                    # the outgoing max had freq 1, so it counts as unique
                    self.tweet_dict[document.tweet_id][1] += 1
                self.tweet_dict[document.tweet_id][0] = [
                    term, document_dictionary[term]
                ]
                self.tweet_dict[document.tweet_id][2] += 1
            elif document_dictionary[term] == 1:
                # tweet exists, term is not the max; count it when unique.
                # NOTE(review): terms with freq > 1 that are not the max are
                # not counted in slot [2] — looks intentional upstream, confirm.
                self.tweet_dict[document.tweet_id][1] += 1
                self.tweet_dict[document.tweet_id][2] += 1
    except Exception:
        # BUG FIX: the bare `except:` also swallowed SystemExit/KeyboardInterrupt,
        # and `print(traceback.print_exc())` printed a stray "None" —
        # traceback.print_exc() already writes the trace itself.
        traceback.print_exc()
def test_VC_measure(self):
    """Tests the VC measure: stemmer.m(word) must match every known value in VC_DATA."""
    stemmer = Stemmer()
    for word, measure in VC_DATA.items():
        calculated = stemmer.m(word)  # hoisted: compute once per word
        # MODERNIZATION: failUnless is deprecated; assertEqual reports both
        # values on failure in addition to the custom message.
        self.assertEqual(
            calculated, measure,
            "Measure test failed for word '%s' calculated (%d) "
            "should have been (%d)" % (word, calculated, measure))
def __init__(self, config):
    """Build the parser from `config`: stemmer, stemmed stop words, rules and spell checker."""
    self.word_dict = {}
    self.stemmer = Stemmer(config.stemming)
    # Stem the standard English stop words so they match stemmed tokens,
    # then append twitter-specific noise tokens.
    stemmed_stops = [
        self.stemmer.stem_term(word) for word in stopwords.words('english')
    ]
    self.stop_words = stemmed_stops + ['rt', 't.co', 'https']
    self.rules = config.parser_rules
    self.spell = SpellChecker()
    self.min_length = config.min_length
def __init__(self, with_stemmer=False, include_urls=False, include_quote=False, debug=False, timer=False):
    """Tokenizer configuration: optional stemming, URL/quote inclusion, debug timing."""
    self.stemmer = Stemmer()
    self.with_stemmer = with_stemmer
    self.include_urls = include_urls
    self.include_quote = include_quote
    # nltk English stop words plus informal/twitter noise tokens.
    extra_noise = ["i'm", "it's", 'they', "i've", 'you', 'u', 'we', 'rt', 'im', 'use', 'sure', ]
    self.stop_words = stopwords.words('english') + extra_noise
    self.debug = debug
    self.timer = timer
    self.times = []  # collected timing samples when timer is enabled
def __init__(self, stemming):
    """Parser counters, stop-word list and optional stemmer.

    :param stemming: truthy to enable stemming.
    """
    self.stop_words = stopwords.words('english')
    self.stop_words += ["rt", "http", "https", "www", "twitter.com"]  # TODO: check &
    self.terms = set()
    self.nonstopwords = 0
    self.max_tf = 0
    self.toStem = stemming
    self.entities = {}
    # ROBUSTNESS FIX: always define self.stemmer so later access cannot raise
    # AttributeError when stemming is disabled (it was previously unset).
    self.stemmer = Stemmer() if self.toStem else None
def test_stem(self):
    """Checks the final stems."""
    # NOTE(review): Python 2 only — relies on the removed `file()` builtin and
    # the old iterator method `.next()` (Python 3 spells it `next(output)`).
    stemmer = Stemmer()
    # voc.txt holds one input word per line; output.txt the expected stem on
    # the corresponding line.
    output = file('output.txt')
    for word in file('voc.txt'):
        word = word.strip()
        stem = output.next().strip()
        # failUnless is the deprecated pre-2.7 spelling of assertTrue; the
        # message reports stemmer.stemmed (the last computed stem).
        self.failUnless(stemmer.stem(word) == stem,
                        "Test failed for word \'%s\' stemmed "\
                        "to %s should have been %s"\
                        % (word, stemmer.stemmed, stem))
def __init__(self, config):
    """Parser setup: optional stemming, stop-word table, document-length counters
    and compiled URL/number/date/emoji patterns.

    :param config: configuration object exposing get_toStem().
    """
    self.with_stem = config.get_toStem()
    self.stemmer = Stemmer()
    self.stop_words = stopwords.words('english')
    # whitespace / quote artifacts produced by tokenization
    self.stop_words.extend([
        r' ', r'', r"", r"''", r'""', r'"', r"“", r"”", r"’", r"‘", r"``",
        r"'", r"`", '"'
    ])
    # BUG FIX: the original had `'}' "'&'"` (missing comma), which silently
    # concatenated into the single bogus stop word "}'&'"; '}' and "'&'" are
    # now separate entries.
    self.stop_words.extend([
        'rt', r'!', r'?', r',', r':', r';', r'(', r')', r'...', r'[', ']',
        r'{', '}', "'&'", '$', '.', r'\'s', '\'s', '\'d', r'\'d', r'n\'t'
    ])
    self.stop_words.extend(['1️⃣.1️⃣2️⃣'])
    self.stop_words_dict = dict.fromkeys(self.stop_words)
    # running totals used to compute the average document length
    self.total_len_docs = 0
    self.number_of_documents = 0
    self.url_pattern = re.compile('http\S+')
    self.url_www_pattern = re.compile("[/://?=]")
    # TODO - fix numbers pattern
    self.numbers_pattern = re.compile(('^\d+([/|.|,]?\d+)*'))
    self.non_latin_pattern = re.compile(
        pattern=
        r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF\u2019]'
    )
    # dd/mm/yyyy-style dates with leap-year handling
    self.dates_pattern = re.compile(
        r'^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[13-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$'
    )
    # TODO - fix emoji to include all emojis
    self.emojis_pattern = re.compile(
        pattern="["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00010000-\U0010ffff"
        u"\U0001f926-\U0001f937"
        u"\U000024C2-\U0001F251"
        u"\U00002702-\U000027B0"
        u"\u2640-\u2642"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"
        u"\u3030"
        u"\u2600-\u2B55"
        u"\uFE0F\u20E3\uFE0F\u20E3\uFE0F\u20E3"
        "]+",
        flags=re.UNICODE)
def __init__(self, config=None, advanced=False):
    """Parser state: English word list, stop words, punctuation set, tokenizer
    and optional stemming driven by the config."""
    self.english_word = words.words()
    self.stop_words = stopwords.words('english')
    # string.punctuation plus tokenizer artifacts and unicode quotes
    puncs_to_add = ['...', '', '\'', '“', '”', '’', '…']
    self.punctuators = list(string.punctuation) + puncs_to_add
    self.tt = TweetTokenizer()
    self.stemmer = Stemmer()
    # Only a real ConfigClass can request stemming; anything else disables it.
    if isinstance(config, ConfigClass):
        self.need_stemming = config.toStem
    else:
        self.need_stemming = False
    self.caps_dict = {}
    self.rules_dict = {}
    self.advanced = advanced
def parse_doc(self, doc_as_list):
    """
    This function takes a tweet document as list and break it into different fields
    :param doc_as_list: list re-preseting the tweet.
    :return: Document object with corresponding fields.
    """
    tweet_id = doc_as_list[0]
    tweet_date = doc_as_list[1]
    full_text = doc_as_list[2]
    url = doc_as_list[3]
    # The following positional fields are unpacked for clarity but unused here.
    indice = doc_as_list[4]
    retweet_text = doc_as_list[5]
    retweet_url = doc_as_list[6]
    retweet_indice = doc_as_list[7]
    quote_text = doc_as_list[8]
    quote_url = doc_as_list[9]
    quoted_indice = doc_as_list[10]
    retweet_quoted_text = doc_as_list[11]
    retweet_quoted_url = doc_as_list[12]
    retweet_quoted_indice = doc_as_list[13]
    term_dict = {}
    tokenized_text = self.parse_sentence(full_text)
    tokenized_quote = self.parse_sentence(quote_text)
    tokenized_url = self.handle_url(url)
    doc_length = len(
        tokenized_text)  # after text operations - length of full_text
    new_tokenized_text = tokenized_text + tokenized_url + tokenized_quote
    if self.stemming is True:
        # BUG FIX: the original appended stems to and removed tokens from the
        # list *while iterating it*, which skips every other element; build
        # the stemmed list in a single pass instead.
        s = Stemmer()
        new_tokenized_text = [s.stem_term(token) for token in new_tokenized_text]
    for term in new_tokenized_text:
        # BUG FIX: `term is not ""` compared identity, not value; use !=.
        if term != "":
            if term not in term_dict:
                term_dict[term] = 1
            else:
                term_dict[term] += 1
    document = Document(tweet_id, tweet_date, full_text, url, retweet_text,
                        retweet_url, quote_text, quote_url, term_dict,
                        doc_length)
    return document
def __init__(self, stemming=0):
    """
    This function initiate the fields of Parse, init the stemmer and entering stop words
    :param stemming: the boolean value is stem is needed (optional)
    """
    self.stemming = stemming
    self.stemmer = Stemmer()
    self.stop_words = stopwords.words('english')
    # BUG FIX: the original had `'=' '/'` (missing comma), which silently
    # concatenated into the single bogus stop word '=/'; '=' and '/' are now
    # separate entries.
    self.stop_words.extend([
        ':', '\'s', '.', ',', ';', '’', '?', '!', 'rt', '-', '|', '~', '(',
        ')', '*', '+', '=', '/', '"', '``', '\'\'', '\n', '\n\n', '&',
        'amp', '…', '\'', '`', '[', ']', '{', '}'
    ])
def __init__(self, stemming=None):
    """
    constructor for this class
    :param stemming: when truthy a Stemmer is attached, otherwise stemmer stays None
    """
    self.stop_words = stopwords.words('english')
    self.stemmer = Stemmer() if stemming else None
    # Spellings, abbreviations and hashtags of SARS-CoV / COVID-19 used for
    # query/token normalization.
    self.corona_list = [
        "SARS", "sars", "Severe Acute Respiratory Syndrome",
        "severe acute respiratory syndrome", "SARS-CoV", "SARS CoV",
        "sars-cov", "sars cov", "coronavirus", "corona virus", "COVID",
        "covid", "Covid", "COVID-19", "covid-19", "#coronavirus",
        "COVID__19", "#COVID", "#COVID-19", "#covid19", "#SARS"
    ]
def __init__(self, rootPath="", inputFolder=""):
    """Preprocessor setup: metadata, stop-word filtering, normalization/tokenization
    and stemming; phase output paths start empty."""
    self.metadata = Metadata()
    self.stopper = Stopper()
    # Stop words are expected under <rootPath>/stopwords.
    stopword_dir = os.path.join(rootPath, "stopwords")
    print("Preprocessor root path: ", rootPath)
    self.stopper.load_stopwords(stopword_dir)
    self.normalizer_tokenizer = NormalizationTokenization()
    self.stemmer = Stemmer()
    # Phase output paths, filled in later (strings are immutable, so the
    # chained assignment is safe).
    self.p1_path = self.p2_path = self.p3_path = ""
    self.rootPath = rootPath
    self.inputFolder = inputFolder
def __init__(self, config):
    """Parser setup: optional stemming, stop words, and URL/number/date patterns."""
    self.with_stem = config.get_toStem()
    self.stemmer = Stemmer()
    # nltk stop words plus the retweet marker.
    base_stops = stopwords.words('english')
    base_stops.extend(['RT'])
    self.stop_words = base_stops
    self.stop_words_dict = dict.fromkeys(self.stop_words)
    # running totals used to compute the average document length
    self.total_len_docs = 0
    self.number_of_documents = 0
    self.url_removal_pattern = re.compile(r'(https?://[^\s]+)')
    # TODO - fix numbers pattern
    self.numbers_pattern = re.compile(('^\d+([/|.|,]?\d+)*'))
    # dd/mm/yyyy-style dates with leap-year handling
    self.dates_pattern = re.compile(
        r'^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[13-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$'
    )
def __init__(self, stemming):
    """Tokenizer setup: stop-word table (punctuation artifacts plus corpus-specific
    frequent words) and an optional stemmer.

    :param stemming: truthy to attach a Stemmer; otherwise self.stemmer is None.
    """
    self.stop_words = stopwords.words('english')
    # BUG FIX: the original had `'}' "'&'"` (missing comma), which silently
    # concatenated into the single bogus stop word "}'&'"; '}' and "'&'" are
    # now separate entries.
    self.stop_words.extend([
        'rt', '“', r'’', r'n\'t', 'n\'t', '\'s', r'\'s', r'\'ve', r'\'m',
        '...', r'\'\'', r'\'d', '&', r'\'ll', r'\'re', r' ', r'', r"",
        r"''", r'""', r'"', r"“", "”", r"’", "‘", r"``", '``', r"'", r"`",
        r'!', r'?', r',', r':', r';', r'(', r')', r'...', r'[', ']', r'{',
        '}', "'&'", '.', r'\'d', '-', '--', 'mask', 'pandemic', 'people',
        'wear', 'trump', 'masks', 'new', 'virus', 'wearing', 'cases',
        'amp', 'us', 'like'
    ])  # , 'covid', '19', 'covid-19', 'mask', 'coronavirus', 'pandemic', 'people', 'wear', 'trump', 'covid19', 'masks', 'new', 'virus', 'wearing', 'cases', 'amp', '#covid19', 'us', 'like'
    self.stop_words_dict = dict.fromkeys(self.stop_words)
    self.text_tokens = None
    self.stemmer = None
    if stemming:
        self.stemmer = Stemmer()
def __init__(self, corpus=None, cxp=True, swr=True, nr=True, stem=True):
    """Configure the preprocessing pipeline.

    :param corpus: optional path-like to the corpus root.
    :param cxp: enable contraction expansion.
    :param swr: enable stop-word removal.
    :param nr: enable noise removal.
    :param stem: enable stemming.
    """
    # IDIOM FIX: compare against None with `is not`, not `!=`.
    if corpus is not None:
        self.corpus_path = Path(str(corpus))
    else:
        self.corpus_path = None
    self.contraction_expansion_flag = False
    self.stop_word_flag = False
    self.noise_removal_flag = False
    self.stemmer_flag = False
    # Each enabled stage sets its flag and constructs its worker object.
    if cxp:
        self.contraction_expansion_flag = True
        self.contraction_expander = ContractionExpander()
    if swr:
        self.stop_word_flag = True
        self.stop_word_remover = StopWordRemover()
    if nr:
        self.noise_removal_flag = True
        self.noise_remover = NoiseRemover()
    if stem:
        self.stemmer_flag = True
        self.stemmer = Stemmer()
def search_and_rank_query(queries, inverted_index, k, lda):
    """Parse each query, expand it with WordNet, score tweets by posting-list
    retrieval plus LDA similarity, append results to results.csv and return the
    top-k tweet ids per query.

    NOTE(review): `config` is read from the enclosing (module) scope — it is
    never passed in; confirm it is initialized before calling.
    :param queries: list of query strings OR a path to a UTF-8 text file with
        one query per line.
    :param inverted_index: index handed to the Searcher.
    :param k: number of results to return per query.
    :param lda: model exposing prob(tokenized_query) -> {line index: score}.
    :return: list (one entry per query) of lists of tweet ids.
    """
    indexer = Indexer(config)
    to_stem = config.get__toStem()
    queries_list = []
    if type(queries) is list:  # queries already given as a list of strings
        for query in queries:
            queries_list.append(query)
    if type(queries) is str:  # queries is a path: one query per non-blank line
        with open(queries, encoding='utf-8') as f:
            for line in f:
                if line != "\n":
                    queries_list.append(line)
    all_results = []
    query_num = 1
    tweet_id_num = 1
    for query in queries_list:
        p = Parse(config)
        # parse the query the same way tweets were parsed (feeds the LDA model)
        tokenized_query = p.parse_sentence(query, 0)
        original_query_list = query.split(" ")
        stop_words = stopwords.words('english')
        original_query_list = [
            w for w in original_query_list if w not in stop_words
        ]
        # find multi-word capitalized entities and all-uppercase acronyms
        counter = 0
        while counter < len(original_query_list):
            len_term = 1
            word = original_query_list[counter]
            if word.isupper():  # acronym, e.g. "NBA"
                # strip a trailing newline / period before indexing it
                if word.find("\n") != -1:
                    word = word[:-1]
                if word.find(".") != -1:
                    word = word[:-1]
                if not to_stem:
                    tokenized_query.append(word)
                else:
                    stem_word = Stemmer().stem_term(word)
                    tokenized_query.append(stem_word)
            elif len(word) > 1 and re.search(
                    '[a-zA-Z]', word) and word[0].isupper():  # capitalized word
                term = word
                if original_query_list.index(word) + 1 < len(
                        original_query_list):
                    index = original_query_list.index(word) + 1
                    # absorb following capitalized words into one entity term
                    while index < len(original_query_list):
                        if len(original_query_list[index]) > 1 and re.search('[a-zA-Z]', original_query_list[index]) and \
                                original_query_list[index][0].isupper():
                            new_word2 = original_query_list[index][
                                0] + original_query_list[index][1:].lower(
                                )  # e.g. "Donald Trump"
                            term += " " + new_word2
                            index += 1
                            len_term += 1
                        else:
                            break
                    if len_term > 1:
                        tokenized_query.append(term)
            counter += len_term
        # expand the query with WordNet synonyms
        wn = WordNet_ranker(tokenized_query)
        WordNet_query = wn.extend_query()
        searcher = Searcher(inverted_index)
        # retrieval score per tweet id from the posting lists
        relevant_docs = searcher.relevant_docs_from_posting(WordNet_query)
        # LDA similarity, keyed by document line index
        cosine_dict = lda.prob(tokenized_query)
        dict_of_cosine_tweets = {}
        # map line index back to tweet id via the indexer's tweet_line_dict
        key_list = list(indexer.tweet_line_dict.keys())
        val_list = list(indexer.tweet_line_dict.values())
        for index in cosine_dict.keys():  # find the tweet id
            dict_of_cosine_tweets[key_list[val_list.index(
                index)]] = cosine_dict[index]
        # combine: only tweets scored by BOTH retrieval and LDA, capped at k
        final_dict = {}
        for tweet_id in dict_of_cosine_tweets.keys():
            if k > len(final_dict):
                if tweet_id in relevant_docs:
                    final_dict[tweet_id] = 0
                    final_dict[tweet_id] += (relevant_docs[tweet_id] +
                                             dict_of_cosine_tweets[tweet_id])
        # sort by combined score, best first (the comprehension's `k` is local
        # to the comprehension in Python 3 and does not clobber the parameter)
        sorted_cosine_tweets = {
            k: v
            for k, v in sorted(
                final_dict.items(), key=lambda item: item[1], reverse=True)
        }
        final_tweets = list(sorted_cosine_tweets.keys())
        # pad with retrieval-only tweets until k results are collected
        if k > len(final_tweets):
            for key in relevant_docs.keys():
                if key not in final_dict:
                    if k > len(final_tweets):
                        final_tweets.append(key)
                    if k == len(final_tweets):
                        break
        """for tweet in relevant_docs.keys():
            if tweet in list_of_cosine_tweets:
                if len(final_tweets) < k:
                    final_tweets.append(tweet)
        if len(final_tweets) < k:
            sorted_cosine_tweets = {k: v for k, v in sorted(list_of_cosine_tweets.items(), key=lambda item: item[1], reverse=True)}
            for key in sorted_cosine_tweets:
                if k > len(final_tweets) and key not in final_tweets:
                    final_tweets.append(key)
                else:
                    break"""
        # append this query's results to results.csv; the "Score" column is
        # actually the 1-based rank counter, not the combined score
        tweet_id_num = 1
        s = ""
        with open('results.csv', 'a', encoding='utf-8') as fp:
            for p in final_tweets:
                s = ("Tweet id: " + "{" + p + "}" + " Score: " + "{" +
                     str(tweet_id_num) + "}" + "\n")
                tweet_id_num += 1
                fp.write(s)
        query_num += 1
        all_results.append(final_tweets)
    # return top K of final_tweets
    return all_results
#query = pattern.getPhoneticCode() #document = searchEntry5.getPhoneticCode() #print query #print document #print " " #print pattern.data.comparePhoneticCodeLists(query, document) #varList = ["halten", "hielt", "gehalt", "haltbar"] #so = Stemmer("") #print so.successorVariety ("gehalten", varList) #varObject = Phonetics("") #sv = varObject.calcSuccVarietyList(varList) #print sv #svm = varObject.calcSuccVarietyMerge(sv) #print svm #print varObject.calcSuccVarietyCount(svm) #text = Advas(["die Kinder freuen sich über die Kastanien"], "") #keywordList = ["die", "der", "das", "sich"] #print text.isLanguageByKeywords (keywordList) #text = Advas(["Schule"], "") #print text.getSynonyms("/home/frank/projekte/openthesaurus/openthesaurus.txt", "") #print text.isSynonymOf("Bildungszentrum", "/home/frank/projekte/openthesaurus/openthesaurus.txt", "") # -- ngram stemmer stemmerObject = Stemmer("") print stemmerObject.ngramStemmer( ["halten", "hielt", "halter", "halt", "gehalten"], 2, 0.4)
def parse_sentence(self, text, tweet_id):
    """
    Tokenize, normalize and filter one tweet's text.

    Pipeline, in order: @-tag merging, percentage normalization, term/entity
    extraction via self.parse_term, number normalization (K/M/B suffixes, $,
    fractions), punctuation stripping, hashtag splitting, stop-word removal
    with lower-casing, and optional stemming (per self.config).
    :param text: raw tweet text
    :param tweet_id: forwarded to self.parse_term for entity tracking
    :return: list of processed tokens; [] when the tweet is a retweet ('RT' first)
    """
    text_tokens = word_tokenize(text)
    if text_tokens[0] == 'RT':
        return []
    # find TAGS: merge each '@' with the following word into one token
    if "@" in text_tokens:
        index_list1 = [n for n, x in enumerate(text_tokens) if x == '@']
        counter = 0
        for index in index_list1:
            if index + 1 < len(text_tokens):
                if text_tokens[index + 1] != '@':
                    new_term = text_tokens[index] + text_tokens[index + 1]
                    text_tokens.append(new_term)
                    counter += 1
        for sign in range(
                counter
        ):  # deletes all '@' and the word after it from list
            rmv_index = text_tokens.index('@')
            if rmv_index + 1 < len(text_tokens):
                if text_tokens[rmv_index + 1] != '@':
                    del text_tokens[rmv_index + 1]
                else:
                    del text_tokens[rmv_index + 1]
                    del text_tokens[rmv_index + 1]
            text_tokens.remove('@')
    ##############################################################################################
    # find PERCENTAGES
    # NOTE(review): this condition is always True — it tests the truthiness of
    # the literal "%" rather than membership of each alternative; likely meant
    # `"%" in text_tokens or "percent" in text_tokens or ...`. Confirm intent.
    if "%" or "percent" or "Percent" or "percentage" or "Percentage" in text_tokens:
        index_list2 = [
            n for n, x in enumerate(text_tokens)
            if x == '%' or x == 'percent' or x == "percentage"
            or x == 'Percent' or x == "Percentage"
        ]
        counter2 = 0
        for index in index_list2:
            if index - 1 >= 0:
                # previous token is numeric-looking → glue "<num>%" on the end
                if not re.search('[a-zA-Z]', text_tokens[index - 1]):
                    new_term = text_tokens[index - 1] + '%'
                    text_tokens.append(new_term)
                    if text_tokens[index] == '%':
                        counter2 += 1
        # NOTE(review): if neither branch below decrements counter2 this loop
        # never terminates — verify inputs can't reach that state.
        while counter2 > 0:  # deletes all '%' and the word after it from list
            rmv_index = text_tokens.index('%')
            if rmv_index + 1 < len(text_tokens) and text_tokens[
                    rmv_index + 1] == '%':  # if %%
                del text_tokens[rmv_index + 1]
                counter2 -= 1
            if rmv_index - 1 >= 0 and not re.search(
                    '[a-zA-Z]', text_tokens[rmv_index - 1]):  # is number
                del text_tokens[rmv_index]
                del text_tokens[rmv_index - 1]
                counter2 -= 1
    ##############################################################################################
    # finding terms, entities and capital letter
    self.parse_term(text_tokens, tweet_id)
    ##############################################################################################
    # find NUMBERS: plain digits or digits with up to six , . / separators
    numbers = []
    for item in text_tokens:
        #([0-9]+[,.]+[0-9]+) item.isnumeric() or item.isdigit() or item.isdecimal() or
        if re.findall("^\d+$|^[0-9]{1,3}([,.\/][0-9]{1,3}){0,6}$",
                      item) and not re.search(
                          '[a-zA-Z]', item):  #^\d+$|^[0-9]{1,3}([,.][0-9]{1,3})?$
            # reject currency/sign-decorated tokens and multi-slash strings
            if item.find('-') == -1 and item.find('€') == -1 and item.find(
                    '£') == -1 and item.find('%') == -1 and item.find(
                        '¢') == -1 and item.find('~') == -1 and item.find(
                            '+') == -1 and item.find(
                                '/') <= 1 and item.find("'") == -1:
                if item.find(',') == -1:
                    numbers.append(item)
                elif item.find(',') != -1 and re.findall(
                        "^([0-9]{1,3})(,[0-9]{3})*$", item):
                    numbers.append(item)
    fractions_list = []
    for num in numbers:
        occur = num.count('.')
        if occur < 2:  # not a date
            rmv_index = text_tokens.index(num)
            to_append = True
            no_text = True
            found_fractions = False
            # "<int> <a/b>" → record the pair as one fraction token
            if text_tokens[rmv_index].find(
                    "/") != -1 and rmv_index - 1 > 0 and text_tokens[
                        rmv_index - 1].isnumeric():  # if found_fractions
                all_fractions = text_tokens[
                    rmv_index - 1] + " " + text_tokens[rmv_index]
                fractions_list.append(all_fractions)
                found_fractions = True
                to_append = True
            if rmv_index + 1 < len(text_tokens):  # a word follows the number
                # million/billion/thousand suffix → scale then re-format
                if text_tokens[rmv_index + 1] == "million" or text_tokens[rmv_index + 1] == "Million" or \
                        text_tokens[rmv_index + 1] == "M" or text_tokens[rmv_index + 1] == "m" or text_tokens[rmv_index + 1] == "MILLION":
                    if len(num) < 6:
                        fixed_num = re.sub("[^\d\.]", "", num)  # remove comma
                        new_num = self.parse_numbers(
                            str(float(fixed_num) * 1000000))
                    else:
                        new_num = self.parse_numbers(num)
                    no_text = False
                    text_tokens[rmv_index + 1] = " "  # remove from list
                    text_tokens[rmv_index] = " "
                if text_tokens[rmv_index + 1] == "billion" or text_tokens[rmv_index + 1] == "Billion" or \
                        text_tokens[rmv_index + 1] == "B" or text_tokens[rmv_index + 1] == "b" or text_tokens[rmv_index + 1] == "BILLION":
                    if len(num) < 9:
                        fixed_num = re.sub("[^\d\.]", "", num)  # remove comma
                        new_num = self.parse_numbers(
                            str(float(fixed_num) * 1000000000))
                    else:
                        new_num = self.parse_numbers(num)
                    no_text = False
                    text_tokens[rmv_index + 1] = " "  # remove from list
                    text_tokens[rmv_index] = " "
                if text_tokens[rmv_index + 1] == "thousand" or text_tokens[rmv_index + 1] == "Thousand" or \
                        text_tokens[rmv_index + 1] == "K" or text_tokens[rmv_index + 1] == "k" or text_tokens[rmv_index + 1] == "THOUSAND":
                    if len(num) < 4:
                        fixed_num = re.sub("[^\d\.]", "", num)  # remove comma
                        new_num = self.parse_numbers(
                            str(float(fixed_num) * 1000))
                    else:
                        new_num = self.parse_numbers(num)
                    no_text = False
                    text_tokens[rmv_index + 1] = " "  # remove from list
                    text_tokens[rmv_index] = " "
                if not no_text:
                    # NOTE(review): bare expression with no effect — leftover?
                    text_tokens[rmv_index + 1]  # TODO:?????????????????
            if rmv_index - 1 >= 0 and text_tokens[rmv_index -
                                                  1] == '$':  # yes $
                if no_text:
                    if len(num) > 3:
                        text_tokens.append("$" + self.parse_numbers(num))
                    else:
                        text_tokens.append("$" + num)
                    text_tokens[rmv_index] = " "  # remove $ from list
                    text_tokens[rmv_index - 1] = " "
                else:
                    text_tokens.append("$" + new_num)
                    text_tokens[rmv_index - 1] = " "  # remove $ from list
                to_append = False
            if to_append:  # no $
                if no_text:
                    if len(num) > 3:
                        text_tokens.append(self.parse_numbers(num))
                        text_tokens[
                            rmv_index] = " "  # remove num from list
                else:
                    text_tokens.append(new_num)
            if found_fractions:  # delete fractions
                del text_tokens[rmv_index]
                del text_tokens[rmv_index - 1]
    """punctuations = '''!(-+—[]{};:'",)<>,./?^&*_’~|=→"”“''' # removes relevant punctuations and http and //short url
    index_count = 0
    for word in text_tokens:
        to_delete = False
        if len(word) > 1 and word.find('-') != -1: # contains '-'
            text_tokens.extend(word.split('-'))
            text_tokens.remove(word)
            to_delete = True
        if len(word) > 1 and word.find('…') != -1: # contains '…'
            if to_delete == False:
                text_tokens.extend(word.split('…'))
                text_tokens.remove(word)
                to_delete = True
        if len(word) > 1 and word.find('_') != -1: # contains '_'
            if to_delete == False:
                text_tokens.extend(word.split('_'))
                text_tokens.remove(word)
                to_delete = True
        if len(word) > 1 and word.find('+') != -1: # contains '+'
            if to_delete == False:
                text_tokens.extend(word.split('+'))
                text_tokens.remove(word)
                to_delete = True
        if len(word) > 1 and word.find('/') != -1 and not (word[0] == '/' and word[1] == '/'): # contains '/'
            if to_delete == False:
                text_tokens.extend(word.split('/'))
                text_tokens.remove(word)
                to_delete = True
        if to_delete == False:
            if word in punctuations:
                i = text_tokens.index(word)
                text_tokens[i] = " "
            elif word == "http" or word == "https" or word == "http..." or word == "https..." or word == "RT" or word == "rt":
                i2 = text_tokens.index(word)
                text_tokens[i2] = " "
            elif len(word) > 1 and word[0] == '/' and word[1] == '/':
                i3 = text_tokens.index(word)
                text_tokens[i3] = " "
            else:
                text_tokens[index_count] = ''.join([i if ord(i) < 128 else '' for i in word])
        index_count += 1
    text_tokens[:] = [x for x in text_tokens if x != " " and x != ".." and x != "..." and x != "...." and x != "....." and x != "......" and x != "``" and x != "''" and x != "'s" and x != "'m" and x != "n't" and x != "." and x != "" and x != "'re" and x != "__" and x != "_" and x != "___" and x != "," and x != "!"]"""
    ##############################################################################################
    # find punctuations: keep "d.d"-style numbers (e.g. 230.3K) as-is,
    # strip punctuation / URLs / RT markers and non-ASCII from everything else
    new_words = []
    regex_pattern_for_num = '.*\d\.\d.*'
    regex_pattern_for_punctuation = 't.co.*|\'m|\'s|n\'t|\'re|\(|\)|\!|\-|\+|\[|\]|\{|\}|\;|\:|\'|\,|\<|\>|\?|\"|\^|\&|\*|\_|\~|\`|\||\=|\→|\/|\”|\“|\’|\—|\.|\``|\\\\|http.*|https.*|^RT$|^rt$'
    for word in text_tokens:
        # if term is a number in form ...d.d.. exp 230.3K - add to list
        if re.match(regex_pattern_for_num, word):
            new_words.append(word)
            continue
        # else - remove all punctuation from the term
        else:
            word = re.sub(regex_pattern_for_punctuation,
                          '',
                          word,
                          flags=re.IGNORECASE)
            word = ''.join([i if ord(i) < 128 else '' for i in word])
            if word == '' or word == ' ':
                continue
            new_words.append(word)
    text_tokens = new_words
    ##############################################################################################
    # find HASHTAGS
    # TODO: #whereIsKCR combined
    if "#" in text_tokens:
        index_list3 = [n for n, x in enumerate(text_tokens) if x == '#']
        for index in index_list3:
            if index + 1 < len(text_tokens):
                if text_tokens[index + 1] != '#' and text_tokens[
                        index + 1][0] != '@' and text_tokens[index + 1].find(
                            "#") == -1:  # next word is not # and not @
                    if text_tokens[index +
                                   1].find('_') == -1:  # not contains '_'
                        new_term = text_tokens[index] + text_tokens[index + 1]
                        text_tokens.append(new_term)
        for sign in range(
                len(index_list3
                    )):  # deletes all '#' and the word after it from list
            rmv_index = text_tokens.index('#')
            if rmv_index + 1 < len(text_tokens) and text_tokens[rmv_index + 1] != '#'\
                    and text_tokens[rmv_index + 1][0] != '@' and text_tokens[rmv_index + 1].find("#") == -1:
                word_val = text_tokens[rmv_index + 1]
                if not word_val.isupper() and not word_val.islower(
                ) and word_val.find('_') == -1:  # split uppercase (camelCase)
                    list_of_words = re.findall('[A-Z][^A-Z]*', word_val)
                    for word in list_of_words:
                        text_tokens.append(word)
                if word_val.find('_') != -1:  # split '_'
                    list_of_words = word_val.split('_')
                    new_word = "#"
                    for word in list_of_words:
                        new_word += word
                        text_tokens.append(word)  # appends each word
                    text_tokens.append(new_word)  # appends #word
                if text_tokens[rmv_index + 1][0] != '@' and (
                        (not word_val.isupper() and not word_val.islower())
                        or word_val.islower() or
                        (word_val.find('_') != -1)):  # TODO: delete #fuck_you
                    del text_tokens[rmv_index + 1]
            text_tokens.remove('#')
    ##############################################################################################
    # add fractions collected during number handling
    text_tokens.extend(fractions_list)
    ##############################################################################################
    # remove stop_words and lower-case the survivors
    text_tokens_without_stopwords = [
        w.lower() for w in text_tokens if w not in self.stop_words
    ]
    ##############################################################################################
    # if stemmer is enabled, stem every remaining token
    to_stem = self.config.get__toStem()
    if to_stem:
        # NOTE(review): a fresh Stemmer per token is wasteful; hoisting one
        # instance out of the loop would be a safe speed-up.
        stem_text_tokens_without_stopwords = []
        for token in text_tokens_without_stopwords:
            stem_token = Stemmer().stem_term(token)
            stem_text_tokens_without_stopwords.append(stem_token)
        return stem_text_tokens_without_stopwords
    return text_tokens_without_stopwords
def parse_sentence(self, text):
    """
    Tokenize *text* with NLTK's TweetTokenizer, drop stop words and bare
    punctuation, and apply the parser's token rules: COVID terms, hashtags,
    @-tags, URLs, percentages, numbers (with thousand/million/billion
    scale words), all-caps words, and capitalized name/entity runs.

    :param text: raw tweet text; non-ASCII characters are replaced by spaces
    :return: list of parsed tokens
    """
    after_parse = []
    # tokenizer:
    # NOTE(review): `stemmer` is unused unless the commented-out stemming
    # block at the end of this method is re-enabled.
    stemmer = Stemmer()
    tweet_tokenizer = TweetTokenizer()
    # strip non-ASCII before tokenizing
    text_tokens = tweet_tokenizer.tokenize(
        re.sub(r'[^\x00-\x7f]', r' ', text))
    symbols = '.,...,:;{}()[]"*?!&$%+-_=></\''
    # drop stop words (case-insensitive) and stand-alone symbol tokens
    text_tokens_without_stopwords = [
        w for w in text_tokens
        if w.lower() not in self.stop_words and w not in symbols
    ]
    # separate - : split "a-b" into "a" and "b"; URLs and tokens that
    # *start* with '-' are left untouched
    j = 0
    while j < len(text_tokens_without_stopwords):
        if '-' in text_tokens_without_stopwords[
                j] and 'http' not in text_tokens_without_stopwords[j]:
            if text_tokens_without_stopwords[j][0] == '-':
                j += 1
                continue
            # only the first two '-'-separated parts are kept
            temp = text_tokens_without_stopwords[j].split('-')
            # NOTE(review): list.remove() deletes the first *equal* element,
            # which may not be index j if the token appears twice — verify.
            text_tokens_without_stopwords.remove(
                text_tokens_without_stopwords[j])
            text_tokens_without_stopwords.insert(j, temp[0])
            if temp[1] != '':
                text_tokens_without_stopwords.insert(j + 1, temp[1])
                j += 1
        j += 1
    i = 0
    covid = ['COVID', 'COVID19', 'CORONAVIRUS', 'CORONA']
    while i < len(text_tokens_without_stopwords):
        # covid rule: normalize any corona-related token to 'COVID19',
        # consuming a following '19' / 'virus' token if present
        if any(covid_exp in text_tokens_without_stopwords[i].upper()
               for covid_exp in covid):
            if i < len(text_tokens_without_stopwords) - 1 and (
                    text_tokens_without_stopwords[i + 1] == '19'
                    or text_tokens_without_stopwords[i + 1].upper() == 'VIRUS'):
                i += 1
            after_parse.append('COVID19')
        # hashtag
        elif text_tokens_without_stopwords[i][0] == '#':
            hashtag = self.parse_hashtags(text_tokens_without_stopwords[i])
            after_parse.extend(hashtag)
        # tagging
        elif text_tokens_without_stopwords[i][0] == '@':
            tag = self.parse_tagging(text_tokens_without_stopwords[i])
            after_parse.append(tag)
        # url
        elif 'http' in text_tokens_without_stopwords[i]:
            url = self.parse_url(text_tokens_without_stopwords[i])
            after_parse.extend(url)
        # percent: "<num> percent"/"<num> percentage" or a trailing '%'
        elif (i < len(text_tokens_without_stopwords) - 2 and
              (text_tokens_without_stopwords[i + 1] == 'percent' or
               text_tokens_without_stopwords[i + 1] == 'percentage')) or \
                text_tokens_without_stopwords[i][-1] == '%':
            # advance onto the 'percent'/'percentage' word itself
            if not text_tokens_without_stopwords[i][-1] == '%':
                i += 1
            percentage = self.parse_percentages(
                text_tokens_without_stopwords[i])
            after_parse.append(percentage)
        # numbers (commas allowed, at most one decimal point)
        elif text_tokens_without_stopwords[i].replace(',', '').replace(
                '.', '', 1).isdigit():
            if '.' in text_tokens_without_stopwords[i]:
                curr_num = float(text_tokens_without_stopwords[i].replace(
                    ',', ''))
            else:
                curr_num = int(text_tokens_without_stopwords[i].replace(
                    ',', ''))
            if i == len(
                    text_tokens_without_stopwords
            ) - 1:  # if this is the last word, send only the current word
                number = self.parse_numbers(curr_num, '')
            else:
                number = self.parse_numbers(
                    curr_num, text_tokens_without_stopwords[i + 1])
                # a scale word is consumed together with the number
                if text_tokens_without_stopwords[i + 1].lower() == 'thousand' or text_tokens_without_stopwords[
                        i + 1].lower() == \
                        'million' or text_tokens_without_stopwords[i + 1].lower() == 'billion':
                    i += 1
            after_parse.append(number)
        # fully upper-case words pass through unchanged
        elif text_tokens_without_stopwords[i].isupper():
            after_parse.append(text_tokens_without_stopwords[i])
        # names and entities: a run of capitalized words is merged; the
        # helper returns (merged_term, run_length)
        elif text_tokens_without_stopwords[i][0].isupper():
            names_and_entities = self.parse_names_and_entities(
                text_tokens_without_stopwords[i:])
            after_parse.append(names_and_entities[0])
            i += names_and_entities[1] - 1
        else:
            after_parse.append(text_tokens_without_stopwords[i])
        i += 1
    # while '' in after_parse: after_parse.remove('')
    # NOTE(review): with `or`, every non-empty token is kept (symbols
    # included) and only '' is dropped; `and` was probably intended — confirm.
    after_parse = [w for w in after_parse if w not in symbols or w != '']
    # after_stem = []
    # for token in after_parse:
    #     after_stem.append(stemmer.stem_term(token))
    # after_parse = after_stem
    return after_parse
else: return check_word def check_case(check_word): remove_word_list = [ "द्वारा", "बाट", "देखि", "लाई", "निम्ति", "मा", "को", "ले", "हरु" ] return [ subfunc(check_word, remove_words) if len(list(check_word[:-len(remove_words)])) >= 3 else check_word for remove_words in remove_word_list ][0] st = Stemmer() words = [st.stem(case) for case in iwords] print(words[:100]) file = open(os.path.join(VISUAL_FLD, "vocab_nep.tsv"), "w") from collections import Counter dictionary = dict() count = [('UNK', -1)] index = 0 count.extend(Counter(words).most_common(VOCAB_SIZE - 1)) for word, _ in count: dictionary[word] = index index += 1 file.write(word + '\n') index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
def pos_tag(self, sentence): stemmer = Stemmer() sent = stemmer.stem(sentence) sent = WordTokenizer(sent) tags = self.tag(sent) return tags
def __init__(self): self.stop_words = stopwords.words('english') self.dictionary_term_index = {} self.array_names_and_entities = {} self.porter_stemmer = Stemmer()
def _get_features(self, tokens, idx):
    """Build the CRF-style feature list for the token at position *idx*.

    Features for the stemmed current token: one ``HAS_NUM`` per digit
    symbol it contains, one ``PUNCTUATION`` per punctuation symbol,
    ``WORD_<token>``, and suffix/prefix n-grams up to length 3. The same
    symbol and affix features (plus a positional word label) are added for
    the stemmed neighbors at offsets -1, -2, +1, +2 when they exist.

    :param tokens: sequence of raw tokens
    :param idx: index of the token to featurize
    :return: list of feature strings; extraction stops early (returning the
             features collected so far) as soon as any stem is empty —
             this preserves the original early-return behavior.
    """
    stemmer = Stemmer()
    numbs = numbers.values()
    puncts = punctuations.values()
    feature_list = []

    token = stemmer.stem(tokens[idx])
    if not token:
        return feature_list
    self._append_char_class_features(feature_list, token, numbs, puncts)
    feature_list.append("WORD_" + token)
    self._append_affix_features(feature_list, token)

    # Neighbor windows, in the original evaluation order:
    # (guard, relative offset, word-label prefix)
    neighbors = (
        (idx >= 1, -1, "PREV_WORD_"),
        (idx >= 2, -2, "PREV_PREV_WORD_"),
        (idx < len(tokens) - 1, 1, "NEXT_WORD_"),
        (idx < len(tokens) - 2, 2, "NEXT_NEXT_WORD_"),
    )
    for in_range, offset, label in neighbors:
        if not in_range:
            continue
        neighbor = stemmer.stem(tokens[idx + offset])
        if not neighbor:
            # original behavior: an empty stem aborts feature extraction
            return feature_list
        self._append_char_class_features(feature_list, neighbor, numbs, puncts)
        self._append_affix_features(feature_list, neighbor)
        feature_list.append(label + neighbor)
    return feature_list


def _append_char_class_features(self, feature_list, token, numbs, puncts):
    """Append one HAS_NUM / PUNCTUATION entry per matching symbol.

    Duplicates are intentional: the original emitted one entry per
    matching symbol, not a single flag.
    """
    for number in numbs:
        if number in list(token):
            feature_list.append("HAS_NUM")
    for punctuation in puncts:
        if punctuation in list(token):
            feature_list.append("PUNCTUATION")


def _append_affix_features(self, feature_list, token):
    """Append SUF_/PRE_ n-gram features for n = 1..3 when len(token) > n."""
    for n in (1, 2, 3):
        if len(token) > n:
            feature_list.append("SUF_" + token[-n:])
            feature_list.append("PRE_" + token[:n])