def treeify_tweet(tweet):
    """Parse a tweet into a dependency tree and extract per-root segments.

    Parameters
    ----------
    tweet : dict with keys 'tokens', 'chunk', 'token_tags', 'parser'.

    Returns
    -------
    list of (tokens, pos_tags, root_token) triples -- one per root whose
    subtree (stop-words removed) contains a verb ('V'), noun ('N'), or
    proper noun ('^') tag.
    """
    tree = ParseTree(tweet['tokens'], tweet['chunk'],
                     tweet['token_tags'], tweet['parser'])
    # Fix: hoist the singleton lookup -- the original called
    # nlp.getInstance() once per token inside each comprehension.
    stop_list = nlp.getInstance().stop_list

    # NOTE(review): `segment` is built but never used; the find_root()
    # calls are preserved in case they populate internal tree state --
    # confirm and delete if they are pure.
    segment = []
    for i in range(tree.get_size()):
        segment.append((tree.find_root(i), tree.tokens[i]))

    rst = []
    for root in tree.find_all_root():
        subtree = tree.find_descendants(root) + [root]
        # Single filtered index list instead of two identically-filtered
        # comprehensions; also avoids shadowing the loop variable.
        kept = [i for i in subtree if tree.tokens[i].lower() not in stop_list]
        tokens = [tree.tokens[i] for i in kept]
        pos_tags = [tree.pos_tags[i] for i in kept]
        # Keep only segments anchored by a verb, noun, or proper noun.
        if 'V' in pos_tags or 'N' in pos_tags or '^' in pos_tags:
            rst.append((tokens, pos_tags, tree.tokens[root]))
            # print(tokens, pos_tags, tree.tokens[root])
    return rst
def __init__(self, query, graph):
    """Initialise the tweet graph, tweet table and NLP helper.

    graph : an existing Graph to attach to, or None to start from scratch.
    """
    # Reuse the supplied graph when given; otherwise build fresh structures.
    self.g = Graph() if graph is None else graph
    self.table = TweetTable() if graph is None else TweetTable(graph=graph)
    self.nodeQueue = list()
    try:
        self.nlp = nlp.getInstance(query)
    except TypeError:
        # Best-effort: report the problematic query without aborting.
        print('failed')
        print(query)
        print(traceback.format_exc())
    self.logging = False
    # Minimum edge weight to keep in the graph when clustering.
    self.min_edge_weight = 3
    # Minimum count of edges required in the graph.
    self.min_edges = 3
    # Clustering mode switch.
    self.clusteringMode = 'community'
def retrieve_phrases(self, toString=False):
    """Collect the token phrase rooted at each root of the parse tree.

    Stop-words and tokens whose POS tag is not in `preserve_pos` are
    dropped.  Returns a list of token lists, or -- when *toString* is
    true -- one string with tokens joined by '_' and phrases by ' '.
    """
    # Fix: hoist the singleton lookup -- the original called
    # nlp.getInstance() once per descendant token.
    stop_list = nlp.getInstance().stop_list
    phrases = []
    for root in self.find_all_root():
        descendants = sorted(self.find_descendants(root) + [root])
        # Merge the two filter passes (stop-list, then POS whitelist).
        kept = [i for i in descendants
                if self.tokens[i].lower() not in stop_list
                and self.pos_tags[i] in preserve_pos]
        # Removed the unused `tags` local from the original.
        tokens = [self.tokens[i] for i in kept]
        if tokens:
            phrases.append(tokens)
    if not toString:
        return phrases
    return " ".join("_".join(entry) for entry in phrases)
def __init__(self, token, tag, index):
    """Populate this token mapping after validating *token* via the NLP
    singleton.  Sets the 'token', 'tag' and 'index' keys on success.
    """
    token = nlp.getInstance().valid_token(token, tag)
    if token is None:
        # NOTE(review): `return None` inside __init__ does NOT prevent the
        # instance from being created -- the caller still receives an
        # object, just one with no 'token'/'tag'/'index' keys set.
        # Confirm that callers detect this (e.g. via a key-presence check).
        return None
    self['token'] = token
    self['tag'] = tag
    self['index'] = index
from idlelib.IOBinding import encoding  # NOTE(review): looks like an accidental IDE auto-import -- confirm before removing
from Search import SolrSearcher
from Search import TimeFunc
from itertools import groupby
import json
import re
from Basic.AhocSearch import AhocSearch
from NLP.TermFold import TermFold
from NLP import NLPManager
import pandas
from test.test_buffer import _ca  # NOTE(review): imports from the stdlib test suite -- almost certainly accidental

# Load the stop-word list once at import time.
# Fix: the original left the file handle open; close it deterministically.
with open("Data/stopwords_en.txt", encoding="utf-8") as _stopwords_file:
    stopwords = _stopwords_file.read().lower().splitlines()
print(stopwords)

nlp = NLPManager.getInstance()


class EMTerms(object):
    """Configuration and lookup tables for EMTerms categories."""

    corpus_dir = "Data/EMTerms-light.csv"
    tf_threshold = 1
    # 23 categories, so tp_threshold does not affect#
    tp_threshold = 30
    keyword_mode = 1  # mode 2 needs pos tagging for each input tweet

    class __EMTerms:
        def __init__(self):
            # term -> data and category-code -> data lookup tables.
            self.term_db = {}
            self.code_db = {}
def treeify_tweet(tweet):
    """Build a ParseTree for a tweet and return its root-anchored segments.

    Parameters
    ----------
    tweet : dict with keys 'tokens', 'chunk', 'token_tags', 'parser'.

    Returns
    -------
    list of (tokens, pos_tags, root_token) triples for every root whose
    stop-word-filtered subtree contains a 'V', 'N' or '^' POS tag.
    """
    tree = ParseTree(tweet['tokens'], tweet['chunk'],
                     tweet['token_tags'], tweet['parser'])
    # Fix: fetch the stop list once instead of calling nlp.getInstance()
    # for every token inside the comprehensions.
    stop_list = nlp.getInstance().stop_list

    # NOTE(review): `segment` is never consumed; find_root() calls kept in
    # case they have side effects on the tree -- verify and drop if pure.
    segment = []
    for i in range(tree.get_size()):
        segment.append((tree.find_root(i), tree.tokens[i]))

    rst = []
    for root in tree.find_all_root():
        subtree = tree.find_descendants(root) + [root]
        # One filtered index list replaces two identically-filtered
        # comprehensions (and stops shadowing the outer index).
        kept = [i for i in subtree if tree.tokens[i].lower() not in stop_list]
        tokens = [tree.tokens[i] for i in kept]
        pos_tags = [tree.pos_tags[i] for i in kept]
        if 'V' in pos_tags or 'N' in pos_tags or '^' in pos_tags:
            rst.append((tokens, pos_tags, tree.tokens[root]))
            # print(tokens, pos_tags, tree.tokens[root])
    return rst
def retrieve_phrases(self, toString=False):
    """Return the filtered token phrase under each parse-tree root.

    Tokens in the NLP stop list or whose POS tag is outside
    `preserve_pos` are discarded.  With *toString* true, phrases are
    rendered as '_'-joined tokens separated by spaces.
    """
    # Fix: hoist the singleton lookup out of the per-token filters.
    stop_list = nlp.getInstance().stop_list
    phrases = []
    for root in self.find_all_root():
        descendants = sorted(self.find_descendants(root) + [root])
        # Combined stop-list + POS filter (two passes in the original).
        kept = [i for i in descendants
                if self.tokens[i].lower() not in stop_list
                and self.pos_tags[i] in preserve_pos]
        # The original also built an unused `tags` list; removed.
        tokens = [self.tokens[i] for i in kept]
        if tokens:
            phrases.append(tokens)
    if not toString:
        return phrases
    return " ".join("_".join(entry) for entry in phrases)
def _gen_forms(self, term):
    """Delegate to the NLP singleton to generate forms of *term*,
    bounded by this object's threshold."""
    manager = nlp.getInstance()
    return manager.gen_forms(term, self.threshold)
# NOTE(review): the indented statements below are the tail of a tf-idf
# scoring function whose `def` line falls outside this chunk -- the
# enclosing signature (source of `tweets`, `tb`, `tfidf`) is unknown;
# indentation restored on that assumption.
    bloblist = [tb(tweet) for tweet in tweets]
    scores = []
    for i, blob in enumerate(bloblist):
        # print("document -->{}".format(blob))
        # Per-document map of word -> TF-IDF score across the corpus.
        score = {word: tfidf(word, blob, bloblist) for word in blob.words}
        # sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        # for word, score in sorted_words[:3]:
        #     print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
        scores.append(score)
    return scores


import NLP.NLPManager as NLPManager

# Module-level NLP helper used by getTweetKeywords below.
nlp = NLPManager.NLPManager()


def getTweetKeywords(tokens):
    """Lower-case *tokens* and keep those that are neither stop-words
    nor URLs (anything starting with 'http')."""
    tokens = [token.lower() for token in tokens]
    keywords = []
    for token in tokens:
        if token in nlp.stop_list:
            continue
        if token.startswith('http'):
            continue
        keywords.append(token)
    return keywords