from nltk.util import ngrams as nltk_ngrams  # alias assumed by all snippets below


def common_ngram_txt(self, tokens1, tokens2, size=15):
    print('Checking ngram length {}'.format(size))
    ng1 = set(nltk_ngrams(tokens1, size))
    ng2 = set(nltk_ngrams(tokens2, size))
    match = set.intersection(ng1, ng2)
    print('..found {}'.format(len(match)))
    return match
from collections import Counter


def extract_ngram_from_text(text, n, remove_stopwords=True, remove_punc=True, mode='spacy'):
    """
    Retrieve all n-grams from the input string.

    :param text: raw string
    :param n: integer telling the function to retrieve all k-grams where k <= n
    :param remove_stopwords: whether or not to remove stopwords
    :param remove_punc: whether or not to remove punctuation
    :param mode: {'spacy', 'naive'}
    :return ngram_counter: a Counter mapping each n-gram to its frequency
    :return all_ngrams: a list of the extracted n-grams
    """
    tokens = tokenize(text, remove_stopwords=remove_stopwords,
                      remove_punc=remove_punc, mode=mode)
    all_ngrams = []
    for i in range(1, n + 1):
        cur_ngrams = nltk_ngrams(tokens, i)
        all_ngrams += cur_ngrams
    ngram_counter = Counter(all_ngrams)
    return ngram_counter, all_ngrams
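# Standalone sketch of the same "all k-grams for k <= n" pattern, using nltk
# directly with a plain whitespace split instead of the tokenize() helper
# assumed above (illustrative only, not part of the original snippet).
from collections import Counter
from nltk.util import ngrams as nltk_ngrams

tokens = "the quick brown fox jumps over the lazy dog".split()
all_ngrams = []
for k in range(1, 3 + 1):  # every k-gram with k <= 3
    all_ngrams += list(nltk_ngrams(tokens, k))
counts = Counter(all_ngrams)
print(counts.most_common(3))  # e.g. [(('the',), 2), (('quick',), 1), ...]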
def ngramize(self, o_tweet):
    ngram = []
    for i in self.ngram_combo:
        ngrams = nltk_ngrams(o_tweet.split(), i)
        for grams in ngrams:
            ngram.append(" ".join(grams))
    return ngram
def append_data(self, unigrams):
    bigrams = [
        bigram[0].lower() + " " + bigram[1].lower()
        for bigram in nltk_ngrams(unigrams, 2)
        if len(bigram[0]) > 1 and len(bigram[1]) > 1
    ]
    self.ngrams += Counter(bigrams)
    self.prune(0.35)
    self.save()
def get_ngrams_around_anchors(n, words, anchors):
    """ Generate ngrams only around certain words (anchors). """
    all_ngrams = []
    for anchor in anchors:
        for i in anchor.get_all_occurrences():
            start_index = max(0, i - n + 1)
            piece = words[start_index:min(i + n, len(words))]
            ngrams = enumerate(nltk_ngrams(piece, n), start=start_index)
            all_ngrams.extend(ngrams)
    return all_ngrams
import numpy as np


def transform(self, documents):
    repeated = []
    mirror = []
    repeated_phrase = []
    for doc in documents:
        repeated_count = 0
        mirror_count = 0
        repeated_phrase_count = 0
        stress_phrase_sents = doc.stress_markers
        for i in range(0, len(stress_phrase_sents)):
            if (i == 1 and stress_phrase_sents[i] == stress_phrase_sents[i - 1]
                    and len(stress_phrase_sents[i]) == 1):
                repeated_count += 1
            else:
                phrases_in_sent = stress_phrase_sents[i]
                # all bigram and trigram combinations of phrases in the sentence
                phrase_combinations = (list(nltk_ngrams(phrases_in_sent, 2)) +
                                       list(nltk_ngrams(phrases_in_sent, 3)))
                for pc in phrase_combinations:
                    phrase = "".join(pc)
                    # mirror: the joined phrase reads the same backwards
                    if phrase == phrase[::-1]:
                        mirror_count += 1
                    # repeated phrase: the second half repeats the first half
                    phrase_length = len(phrase)
                    first_half = phrase[:phrase_length // 2]
                    second_half = (phrase[phrase_length // 2:]
                                   if phrase_length % 2 == 0
                                   else phrase[phrase_length // 2 + 1:])
                    if first_half == second_half:
                        repeated_phrase_count += 1
        repeated.append(repeated_count)
        mirror.append(mirror_count)
        repeated_phrase.append(repeated_phrase_count)
    X = np.array([repeated, mirror, repeated_phrase]).T
    return X
def ngrams(self, n, min_repetitions=2):
    """Returns a dictionary of ngrams that occur at least `min_repetitions` times.

    Args:
        n (int): n in ngram
        min_repetitions (int): minimum number of occurrences for an ngram to be kept

    Returns:
        dict: dictionary mapping each ngram to its count
    """
    # count all ngrams in the filtered text
    ngram_dict = dict(Counter(nltk_ngrams(self.filtered_text.split(), n)))
    # keep only ngrams that occur at least min_repetitions times
    ngram_dict = {
        key: value
        for key, value in ngram_dict.items()
        if value >= min_repetitions
    }
    return ngram_dict
def semanticize(self, sentence, normalize_dash=True, normalize_accents=True,
                normalize_lower=False, translations=True, counts=False,
                sense_probability_threshold=None):
    if sense_probability_threshold is None:
        sense_probability_threshold = self.sense_probability_threshold
    result = {"links": []}

    ngrams = set()
    token_lists = [tokenize(sentence),
                   tokenize(sentence.replace('-', ' ')),
                   tokenize(sentence.replace('.', ' ')),
                   tokenize(sentence.replace('.', ''))]
    # get all ngrams for this sentence, limit to max_ngram_length
    # if applicable
    for token_list in token_lists:
        max_len = len(token_list) + 1
        if self.max_ngram_length is not None:
            max_len = min(max_len, self.max_ngram_length)
        for n in range(1, max_len):
            for ngram in nltk_ngrams(token_list, n):
                ngrams.add(' '.join(ngram))

    normal_ngrams = map(wpmutil.normalize, ngrams)
    exist = self.wpm.normalized_entities_exist(normal_ngrams)

    for i, (ngram, normal_ngram) in enumerate(zip(ngrams, normal_ngrams)):
        if exist[i]:
            normalized_ngram = wpmutil.normalize(ngram, normalize_dash,
                                                 normalize_accents,
                                                 normalize_lower)
            anchors = self.wpm.get_all_entities(normal_ngram)
            for anchor in anchors:
                normalized_anchor = wpmutil.normalize(anchor, normalize_dash,
                                                      normalize_accents,
                                                      normalize_lower)
                if normalized_ngram == normalized_anchor:
                    if self.debug and not self.wpm.entity_exists(anchor):
                        raise LookupError("Data corrupted, cannot "
                                          "find %s in the database" % anchor)
                    entity = self.wpm.get_entity_data(anchor)
                    for sense in entity['senses']:
                        sense_str = str(sense)
                        sense_data = self.wpm.get_sense_data(anchor, sense_str)
                        if sense_data:
                            if entity['cnttextocc'] == 0:
                                link_probability = 0
                                sense_probability = 0
                            else:
                                link_probability = float(entity['cntlinkdoc']) / entity['cnttextdoc']
                                sense_probability = float(sense_data['cntlinkdoc']) / entity['cnttextdoc']
                            if sense_probability > sense_probability_threshold:
                                title = unicode(self.wpm.get_item_title(sense_str))
                                url = self.wikipedia_url_template \
                                    % (self.language_code,
                                       urllib.quote(title.encode('utf-8')))
                                if entity['cntlinkocc'] == 0:
                                    prior_probability = 0
                                else:
                                    prior_probability = float(sense_data['cntlinkocc']) / entity['cntlinkocc']
                                link = {
                                    "label": anchor,
                                    "text": ngram,
                                    "title": title,
                                    "id": sense,
                                    "url": url,
                                    "linkProbability": link_probability,
                                    "senseProbability": sense_probability,
                                    "priorProbability": prior_probability
                                }
                                if translations:
                                    link["translations"] = {self.language_code:
                                                            {"title": title,
                                                             "url": url}}
                                    if self.wpm.sense_has_trnsl(sense_str):
                                        for lang in self.wpm.get_trnsl_langs(sense_str):
                                            trnsl = self.wpm.get_sense_trnsl(sense_str, lang)
                                            link["translations"][lang] = {
                                                'title': unicode(trnsl),
                                                'url': self.wikipedia_url_template
                                                % (lang,
                                                   urllib.quote(unicode(trnsl).encode('utf-8')))
                                            }
                                if counts:
                                    link["occCount"] = entity['cnttextocc']
                                    link["docCount"] = entity['cnttextdoc']
                                    link["linkOccCount"] = entity['cntlinkocc']
                                    link["linkDocCount"] = entity['cntlinkdoc']
                                    link["senseOccCount"] = int(sense_data['cntlinkocc'])
                                    link["senseDocCount"] = int(sense_data['cntlinkdoc'])
                                    link['fromTitle'] = sense_data['from_title']
                                    link['fromRedirect'] = sense_data['from_redir']
                                result["links"].append(link)
    return result
def get_all_ngrams(n, words):
    """ Generate all possible n-grams from a text and enumerate them. """
    return enumerate(nltk_ngrams(words, n))  # not necessarily a list
import itertools


def get_ngrams(unigrams, orders=[1, 2]):
    # chain together the n-grams of every requested order, then deduplicate
    all_ngrams = itertools.chain(*(list(nltk_ngrams(unigrams, n)) for n in orders))
    return set(all_ngrams)
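# Usage sketch for get_ngrams above (assumes the definition is in scope):
# with the default orders=[1, 2] it returns the union of unigram and bigram tuples.
tokens = "to be or not to be".split()
print(get_ngrams(tokens))
# -> {('to',), ('be',), ('or',), ('not',),
#     ('to', 'be'), ('be', 'or'), ('or', 'not'), ('not', 'to')}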
else: print("Loading effect list for words") f = open("data/effect_list.pkl", "rb") effect_list = pickle.load(f) f.close() if not path.exists("data/{}grams.pkl".format(NGRAM)) or args.rebuild: print("Building {}-grams".format(NGRAM)) ngrams = [] for s in sentences: g = list( nltk_ngrams(s.split(), NGRAM, pad_left=True, left_pad_symbol="<s>", pad_right=True, right_pad_symbol="</s>")) ngrams.extend(g) # TODO: Filter out the least frequent words ngrams_count = dict(Counter(ngrams).viewitems()) total_count = sum(ngrams_count.values()) ngrams = dict([(g, float(ngrams_count[g]) / total_count) for g in ngrams_count.keys()]) f = open("./data/{}grams.pkl".format(NGRAM), "wb") pickle.dump(ngrams, f) f.close()