def extractKeywords(category_label, event_text):
    from topia.termextract import extract
    keywords_count = {}
    # training data: use the first 202 entries in the dataset
    for i in range(202):
        extractor = extract.TermExtractor()
        a = extractor(event_text[i])
        for pair in a:
            if pair[0] in keywords_count:
                keywords_count[pair[0]] += pair[1]
            else:
                keywords_count[pair[0]] = pair[1]
    import operator
    sorted_keywords_count = sorted(keywords_count.items(), key=operator.itemgetter(1))
    tmp = []
    import re
    # keep only keywords made of letters and spaces
    for sorted_keyword_count in sorted_keywords_count:
        if re.match('^[A-Za-z ]*$', sorted_keyword_count[0]) is not None:
            tmp.append(sorted_keyword_count)
    sorted_keywords_count = tmp
    total_extracted_keywords_count = len(sorted_keywords_count)
    # use all surviving keywords (sorted by frequency) as features
    keywords = sorted_keywords_count
    f2 = open('keywords', 'w')
    for keyword in keywords:
        f2.write(keyword[0] + "\n")
    f2.close()
def extractorTest():
    reviews = open("./crawled/concatReviews.txt").read()
    extractorOut = open("./taggers/extractorOut.txt", 'w+')
    extractor = extract.TermExtractor()
    # print(extractor(reviews))
    for extension in re.split('-{41,}', reviews):
        extractorOut.write(str(extractor(str(extension))) + '\n')
def extract_word_distribution(self):
    extractor = extract.TermExtractor()
    indices = []
    i = 0
    for label in self.raw_labels:
        if label in ["raw.abstract", "title", "raw.title"]:
            indices.append(i)
        i += 1
    if len(indices) > 2:
        indices = indices[1:]
    total = 0
    documents_to_words = []
    for paper_data in self.raw_data:
        paper_text = ''
        for index in indices:
            paper_text += paper_data[index]
            total += len(paper_data[index])
        document_to_words = []
        keywords = extractor(paper_text)
        for keyword in keywords:
            if keyword[2] > 3:
                break
            word_id = self.insert_word(keyword[0])
            word_count = keyword[1]
            self.words_inverted[word_id] = keyword[0]
            document_to_words.append((word_id, word_count))
        documents_to_words.append(document_to_words)
    print("Extracted total {}".format(total))
    return documents_to_words
def main():
    import string
    import csv
    import re
    import itertools
    from topia.termextract import tag
    tagger = tag.Tagger()
    tagger.initialize()
    fp = open('Mech.txt', 'r')
    text = fp.read()
    # collapse runs of consecutive identical characters
    text = ''.join(ch for ch, _ in itertools.groupby(text))
    text = filter(lambda x: (x in string.printable), text)
    #text = text.replace('\n', '.')
    text = re.sub('[^a-zA-Z0-9.,;:\\/\'&()]', ' ', text)
    print tagger.tokenize(text)
    print tagger(text)
    from topia.termextract import extract
    extractor = extract.TermExtractor()
    #extractor.filter = extract.permissiveFilter
    keywords = extractor(text)
    print keywords
    #print type(keywords)
    with open('topia_keywords.csv', 'wb') as tcsv:
        tcsv_write = csv.writer(tcsv)
        # sort rows by occurrence count before writing
        for row in sorted(keywords, key=lambda kw: kw[1]):
            tcsv_write.writerow(row)
def return_food_sentences(eatery_id):
    from sklearn.externals import joblib
    sent_tokenizer = SentenceTokenizationOnRegexOnInterjections()
    reviews_list = list()
    for post in reviews.find({"eatery_id": eatery_id}):
        reviews_list.extend([[sent, post.get("review_time")]
                             for sent in sent_tokenizer.tokenize(post.get("review_text"))])
    tags = TAG_CLASSIFIER_LIB.predict([e[0] for e in reviews_list])
    food_sentences = list()
    for (sent, review_time), tag in zip(reviews_list, tags):
        if tag == "food":
            food_sentences.append([sent, review_time])
    sub_tags = FOOD_SB_TAG_CLASSIFIER_LIB.predict([e[0] for e in food_sentences])
    dishes_n_drinks = list()
    for (sent, review_time), sub_tag in zip(food_sentences, sub_tags):
        if sub_tag == "dishes" or sub_tag == "drinks":
            dishes_n_drinks.append([sent, review_time])
    sentiments = SENTI_CLASSIFIER_LIB.predict([e[0] for e in dishes_n_drinks])
    from topia.termextract import extract
    topia_extractor = extract.TermExtractor()
    noun_phrases = list()
    for (sent, review_time), tag in zip(dishes_n_drinks, sentiments):
        nouns = topia_extractor(sent)
        noun_phrases.append([tag, [e[0].lower() for e in nouns], review_time])
    return (filter(lambda x: x[1], noun_phrases), [e[0] for e in dishes_n_drinks])
def __init__(self):
    self.extractor = extract.TermExtractor()
    self.extractor.filter = extract.permissiveFilter
    self.usable_characters = set(
        '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ \''
    )
    secret = open(secrets).readlines()[0].strip()
    self.client = wolframalpha.Client(secret)
def terms(url):
    terms = {}
    html = requests.get(url)
    content = html.content.decode("utf-8")
    soup = BeautifulSoup(content)
    #print soup.get_text()
    '''
    for script in soup(['script','style']):
        script.extract
    text=soup.get_text().decode("utf-8")
    print(text)
    '''
    # strip non-visible elements before extracting text
    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title', 'select'])]
    visible_text = soup.getText()
    #print soup.getText()
    print visible_text.decode
    f = open('haha4.txt', 'w')
    for i in visible_text:
        f.write(i.encode('utf-8'))
    f.close()
    tagger = tag.Tagger('english')
    tagger.initialize()
    # create the extractor with the tagger
    extractor = extract.TermExtractor(tagger=tagger)
    # invoke tagging the text
    patt = "((?: [\x00-\x7F] | [\xC0-\xDF][\x80-\xBF] | [\xE0-\xEF][\x80-\xBF]{2} | [\xF0-\xF7][\x80-\xBF]{3}){1,100})"
    s = nltk.data.load('haha4.txt', format='raw').lower()
    # note: re.sub returns a new string; the result is not assigned back to s here
    re.sub(patt, '', s)
    extractor.tagger(s)
    # extract all the terms, even the "weak" ones
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)
    # extract
    print extractor(s)
    result = []
    for ss in extractor(s):
        #print ss[0]
        for i in ss[0].split(" "):
            for j in i.split("-"):
                if not j in result:
                    result.append(j)
    print result
    with open("words.txt", "a") as myfile:
        for i in result:
            myfile.write(i + "\n")
    return result
def extract_terms(text):
    extractor = extract.TermExtractor()
    extractor.filter = extract.permissiveFilter
    terms = extractor(text)
    #return terms
    return [
        t[0].lower() for t in terms
        if t[2] == 1 and MIN_TERM_LENGTH <= len(t[0]) <= MAX_TERM_LENGTH
    ]
def getTerms(text):
    terms = []
    extractor = extract.TermExtractor()
    extractor.filter = extract.permissiveFilter
    for t in text:
        ext = extractor(t)
        newterms = []
        for e in ext:
            newterms.append(e[0])
        terms.append(newterms)
    return terms
def __init__(self, review_id, review_text, review_time, eatery_id):
    """ Lowering the review text """
    self.review_id, self.review_text, self.review_time, self.eatery_id = review_id, \
        SolveEncoding.to_unicode_or_bust(review_text.lower().replace(" \n", "")), review_time, eatery_id
    print self.review_time, self.review_text, self.review_id, self.eatery_id
    self.cuisine_name = list()
    self.places_names = list()
    self.np_extractor = extract.TermExtractor()
def extractKeywords(text):
    extractor = extract.TermExtractor()
    #inputFile = open("input.txt", 'r')
    #text = inputFile.read()
    keywords = sorted(extractor(text))
    keyPhrases = []
    for tuples in keywords:
        keyPhrases.append(tuples[0])
    return keyPhrases
def getImportaantFeatures(self):
    extractor = extract.TermExtractor()
    extractor.filter = extract.permissiveFilter
    key_word_for_desc = extractor(self.description)
    dict_important_features = {}
    for element in key_word_for_desc:
        word = stem_word(element[0])
        if len(word) != 0:
            dict_important_features[word] = element[1]
    #print str(dict_important_features)
    return dict_important_features
def taggerTest():
    reviews = open("./crawled/concatReviews.txt").read()
    # tagger = tag.Tagger()
    # tagger.initialize()
    # tagger.tokenize(reviews)
    # extract.TermExtractor(tagger)
    # extractor.filter = extract.permissiveFilter
    # # extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=2)
    # extracted = extractor(reviews)
    # printTaggedTerms(extracted)
    extractor = extract.TermExtractor()
    print(extractor(reviews))
def get_keywords(text):
    # guard against None, empty, or False input
    # (the original `if text is None or "" or False` only ever checked for None)
    if not text:
        return []
    extractor = extract.TermExtractor()
    keywords = sorted(extractor(text))
    filtered_keywords = []
    for keyword in keywords:
        if keyword[1] > 2:
            filtered_keywords.append(keyword[0])
    return filtered_keywords
def extract_keywords(doc, lower=False):
    extractor = extract.TermExtractor()
    extractor.filter = extract.DefaultFilter()
    keywords_list = []
    keywords = extractor(doc)
    for keyword in keywords:
        if lower == True:
            keywords_list.append(keyword[0].lower())
        else:
            keywords_list.append(keyword[0])
    return keywords_list
def keyterms(text, language='english'):
    # initialize the tagger with the required language
    tagger = tag.Tagger(language)
    tagger.initialize()
    # create the extractor with the tagger
    extractor = extract.TermExtractor(tagger=tagger)
    # invoke tagging the text
    # s = nltk.data.load('corpora/operating/td1.txt', format='raw')
    extractor.tagger(text)
    # extract all the terms, even the "weak" ones
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)
    # extract
    return extractor(text)
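# A minimal usage sketch for keyterms() above, assuming the topia.termextract
# package is installed and that its extractor yields (term, occurrences,
# word_count) tuples, as the index accesses elsewhere in this file suggest.
# The sample text is illustrative only.
if __name__ == '__main__':
    sample = ("Police said the incident took place on Sunday night. "
              "Police are investigating the incident.")
    for term, occurrences, word_count in keyterms(sample):
        # e.g. a single-word term seen twice would appear as (term, 2, 1)
        print(term, occurrences, word_count)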
def get_terms(url):
    text = get_text(url)
    extractor = extract.TermExtractor()
    extractor.filter = extract.permissiveFilter
    #extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=2)
    terms = extractor(text)
    for t in terms:
        if MIN_TERM_LENGTH <= len(t[0]) <= MAX_TERM_LENGTH:
            txt = t[0]
            txt = txt.translate(title_trans)
            txt = txt.replace('_', ' ')
            txt = txt.strip().lower()
            if len(txt) > 2:
                t2 = [txt, t[1], t[2]]
                yield t2
def __init__(self, text):
    self.text = text
    self.conll_extractor = ConllExtractor()
    self.topia_extractor = extract.TermExtractor()
    ## Our custom tokenizer
    self.custom_sent_tokenizer = SentenceTokenizationOnRegexOnInterjections()
    self.tokenized_sentences = self.custom_sent_tokenizer.tokenize(self.text)
    ## This method will apply the stemmers to the sentences
    self.stemming()
    print nltk.sent_tokenize(self.text)
    self.np_textblob()
    self.np_topia()
def POST(self):
    import sys
    import re
    import simplejson as json
    from topia.termextract import extract
    extractor = extract.TermExtractor()
    #extractor.filter = extract.permissiveFilter
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)

    def term_compare(x, y):
        if y[1] + y[2] * 2 > x[1] + x[2] * 2:
            return 1
        elif y[1] == x[1] and y[2] == x[2]:
            return 0
        else:  # x < y
            return -1

    input = web.input(callback=None)
    content = input.context.lower()
    content = content.replace(u"\u201c", '"').replace(u"\u201d", '"').replace(
        u"\u2018", "'").replace(u"\u2019", "'").replace(u"\u2026", "")
    list = sorted(extractor(content), term_compare)
    list = list[:50]
    for i in range(len(list) - 1, -1, -1):
        if len(list[i][0]) == 1 or list[i][2] > 2 or (
                list[i][0].find("http") >= 0) or not re.search(
                    '[a-z]', list[i][0]) or re.search('[0-9]', list[i][0]):
            list.remove(list[i])
        else:
            # prepend /tags/ to match expected input on server
            list[i] = list[i][0].strip()
    callback = input.callback
    pattern = r'[^a-zA-Z0-9 ]'
    for i in range(len(list) - 1, -1, -1):
        if re.search(pattern, list[i]):
            list.remove(list[i])
    if len(sys.argv) > 2:
        length = int(sys.argv[2])
        if len(list) > length:
            list = list[:length]
    list = json.dumps(list, indent=4)
    if callback and re.match('^[a-zA-Z0-9._\[\]]+$', callback):
        return callback + '(' + list + ')'
    else:
        return list
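# Clarifying note on the ranking used in POST() above (not part of the original
# handler): term_compare sorts candidates by occurrences + 2 * word_count,
# highest first. For example, a two-word phrase seen once scores 1 + 2*2 = 5,
# while a single word seen twice scores 2 + 2*1 = 4, so the phrase ranks higher.
def _rank_score(term_tuple):
    # term_tuple is topia's (term, occurrences, word_count) triple; this helper
    # name is hypothetical and only restates the weighting for readability.
    return term_tuple[1] + term_tuple[2] * 2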
def extract_terms(text):
    """
    Use topia.termextract to perform a simple tag extraction from user comments.
    """
    extractor = extract.TermExtractor()
    # Use permissive filter to find all possibly relevant terms in short texts.
    extractor.filter = extract.permissiveFilter
    terms = extractor(text)
    # Collect terms in lower case, but only the ones that consist of single
    # words (t[2] == 1), and are at most 25 chars long.
    return [
        t[0].lower() for t in terms
        if t[2] == 1 and settings.MIN_TERM_LENGTH <= len(t[0]) <= settings.MAX_TERM_LENGTH
    ]
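# A hedged usage sketch for extract_terms() above. It assumes the project
# settings define MIN_TERM_LENGTH and MAX_TERM_LENGTH (the comment suggests an
# upper bound of 25 characters); the sample comment text is illustrative.
def _demo_extract_terms():
    comment = "Great coffee and friendly baristas at this coffee shop."
    # Only lower-cased single-word terms within the configured length bounds
    # are returned, e.g. nouns such as "coffee".
    return extract_terms(comment)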
def buildX(event_text, keywords):
    import pandas as pd
    from topia.termextract import extract
    X = []
    for i in range(len(event_text)):
        text = event_text[i]
        x = {}
        for keyword in keywords:
            x[keyword] = 0
        extractor = extract.TermExtractor()
        pairs = extractor(event_text[i])
        for pair in pairs:
            if pair[0] in keywords:
                x[pair[0]] += pair[1]
        X.append(x)
    return pd.DataFrame(X)
def get_keywords(input_text):
    input_text = input_text.lower()
    stop_words = stopwords.words('english')
    remove = '|'.join(stop_words)
    regex = re.compile(r'\b(' + remove + r')\b', flags=re.IGNORECASE)
    input_text = regex.sub("", input_text)
    keyword_set = set()
    extractor = extract.TermExtractor()
    for x in sorted(extractor(input_text)):
        words = re.sub('[^0-9a-zA-Z@#]+', ' ', x[0]).split()
        for word in words:
            keyword_set.add(word)
    return keyword_set
def terms(url):
    terms = {}
    url = "http://www." + url
    html = requests.get(url)
    content = html.content.decode("utf-8")
    soup = BeautifulSoup(content, "lxml")
    '''
    for script in soup(['script','style']):
        script.extract
    text=soup.get_text().decode("utf-8")
    print(text)
    '''
    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
    visible_text = soup.getText()
    #print visible_text.decode
    f = open('haha4.txt', 'w')
    f2 = open('keys', 'a')
    for i in visible_text:
        f.write(i.encode('utf-8'))
        if not i in terms:
            terms[i] = 1
        else:
            terms[i] = terms[i] + 1
            #print "yees"
    pickle.dump(terms, f2)
    f2.close()
    f.close()
    tagger = tag.Tagger('english')
    tagger.initialize()
    # create the extractor with the tagger
    extractor = extract.TermExtractor(tagger=tagger)
    # invoke tagging the text
    s = nltk.data.load('haha4.txt', format='raw')
    extractor.tagger(s)
    # extract all the terms, even the "weak" ones
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)
    # extract
    #print extractor(s)
    return terms
def topia(self):
    ### extract terms using topia
    from topia.termextract import extract
    extractor = extract.TermExtractor()
    sentences = eval(segmentor.segment(text, 1))['sentences'][:-5]

    # filtered() must be defined before it is used in the comprehension below;
    # defining it afterwards would raise UnboundLocalError.
    def filtered(term):
        if not re.search('[a-zA-Z]+', term):
            return False
        if re.search('(PM|AM)', term):
            return False
        if isVerb(term):
            return False
        return True

    terms = sorted(extractor(" ".join(sentences)))
    terms = [i for i in terms if filtered(i[0])]
    return terms
def keywords(env, start_response):
    """Extracts key words and phrases from resume."""
    start_response('200 OK', [('Content-Type', 'text/xml')])
    try:
        with open('Darin_Plutchok_Resume_Taxonomist.txt') as f:
            text = f.read()
    except:
        raise restlite.Status, '400 Error Reading File'
    mycleaner = clean_text(text, [
        "strip_characters", "eliminate_stopwords", "eliminate_nonwords",
        "normalize_tokens"
    ])
    cleaned = mycleaner.clean()
    extractor = extract.TermExtractor()
    keywords_tuples = extractor(cleaned)
    doc = create_xml({'keywords': keywords_tuples})
    return [doc.toxml()]
def generate(self, size):
    try:
        from topia.termextract import extract
    except ImportError:
        raise CommandError("topia.termextract library required")
    extractor = extract.TermExtractor()
    extractor.filter = extract.permissiveFilter
    titles = Link.objects.values_list("title", flat=True)
    tags = extractor(" ".join(titles))
    tags.sort(key=lambda tag: tag[1], reverse=True)

    def valid_tag(tag):
        def valid_char(char):
            return not (char in punctuation or char.isdigit())
        return filter(valid_char, slugify(tag[0]))

    for tag in filter(valid_tag, tags)[:size]:
        print("Creating keyword %s" % tag[0])
        Keyword.objects.get_or_create(title=tag[0])
def main():
    try:
        # list of index terms
        index_list = list()
        # init tagging
        tagger = tag.Tagger()
        tagger.initialize()
        extractor = extract.TermExtractor(tagger)
        #extractor.filter = extract.permissiveFilter
        #extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=2)
        # get file path; you may need to customize this
        p = os.path.join('*.docx')
        # go through files
        for infile in glob.glob(p):
            # open document
            doc = Document(os.getcwd() + os.sep + infile)
            print os.getcwd() + os.sep + infile
            # get text from Word document
            text = getdocumenttext(doc)
            # tagging
            l = extractor(text)
            for item in l:
                if item[0] not in index_list:
                    index_list.append(item[0])
            # close Word document
            del doc
        file = codecs.open(os.getcwd() + os.sep + 'all_concordances.tsv', 'w', 'utf8')
        for row in sorted(index_list):
            file.write(row + '\t\n')
        file.close()
    finally:
        print "Done!"
def __init__(self, list_of_sentences, default_np_extractor=None,
             regexp_grammer=None, if_postagged=False):
    """
    Args:
        list_of_sentences: A list of lists, each element being a list of
            pos-tagged sentences.
            Example:
                [[('I', 'PRP'), ('went', 'VBD'), ('there', 'RB'),
                  ('for', 'IN'), ('phirni', 'NN')], [], [], ...]
        default_np_extractor:
            If a list is passed, the noun phrases from the various
            np_extractors will be appended.
            If a string is passed, only the noun phrases from that
            np extractor will be appended.
            Options:
                regex_np_extractor
                regex_textblob_conll_np
                textblob_np_conll
                textblob_np_base
    """
    self.if_postagged = if_postagged
    self.noun_phrases = list()
    self.conll_extractor = ConllExtractor()
    self.topia_extractor = extract.TermExtractor()
    self.list_of_sentences = list_of_sentences
    self.np_extractor = (
        "textblob_np_conll", default_np_extractor)[default_np_extractor != None]
    if not regexp_grammer:
        self.regexp_grammer = r"CustomNounP:{<JJ|VB|FW|VBN>?<NN.*>*<NN.*>}"
    eval("self.{0}()".format(self.np_extractor))
    self.noun_phrases = {self.np_extractor: self.noun_phrases}
    return
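# Hypothetical construction sketch for the class this __init__ belongs to (the
# class name NounPhraseExtractor is assumed for illustration; only the argument
# shapes follow the docstring above).
#
#   sentences = [[('I', 'PRP'), ('went', 'VBD'), ('there', 'RB'),
#                 ('for', 'IN'), ('phirni', 'NN')]]
#   np = NounPhraseExtractor(sentences, default_np_extractor="textblob_np_conll",
#                            if_postagged=True)
#   np.noun_phrases   # {"textblob_np_conll": [...]}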
def main():
    try:
        # list of index terms
        index_list = list()
        # init tagging
        tagger = tag.Tagger()
        tagger.initialize()
        extractor = extract.TermExtractor(tagger)
        #extractor.filter = extract.permissiveFilter
        #extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=2)
        # get file path
        p = os.path.join('final.ms' + os.sep, '*chapter*.docx')  # you may need to customize this
        # go through files
        for infile in glob.glob(p):
            # open document
            doc = Document(os.getcwd() + '\\' + infile)
            print os.getcwd() + '\\' + infile
            # get text from Word document
            text = getdocumenttext(doc)
            # tagging
            l = extractor(text)
            for item in l:
                if item[0] not in index_list:
                    index_list.append(item[0])
            # close Word document
            del doc
        write_concordance(sorted(index_list),
                          os.getcwd() + os.sep + 'all_concordance.docx')
    finally:
        print "Done!"
def __get_category(self, document):
    extractor = extract.TermExtractor()
    extractor.filter = extract.permissiveFilter
    extracted_key_word = extractor(document.text)
    dict_category_value = {}
    # calculating likelihood for each category
    for word_object in extracted_key_word:
        word = word_object[0]
        count = word_object[1]
        if word in self.dict_refined_features:
            key_feature = self.dict_refined_features[word]
            for category_name in key_feature.dict_category_probability:
                if category_name in dict_category_value:
                    dict_category_value[category_name] += count * math.log(
                        key_feature.dict_category_probability[category_name], 10)
                else:
                    dict_category_value[category_name] = count * math.log(
                        key_feature.dict_category_probability[category_name], 10)
    # calculating prior
    for category_name in dict_category_value:
        dict_category_value[category_name] += math.log(
            self.dict_category[category_name].prior)
    max_value = -10000000000
    document_category_name = ""
    # calculating max
    for category_name in dict_category_value:
        if dict_category_value[category_name] >= max_value:
            max_value = dict_category_value[category_name]
            document_category_name = category_name
    return document_category_name
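# A self-contained, hedged sketch of the scoring rule implemented by
# __get_category above, with plain dicts standing in for the class attributes;
# the function and argument names here are illustrative, not the original API.
import math

def naive_bayes_category(word_counts, word_category_probs, category_priors):
    """word_counts: {word: count}; word_category_probs: {word: {category: P(w|c)}};
    category_priors: {category: prior}. Returns the highest-scoring category."""
    scores = {}
    for word, count in word_counts.items():
        for category, prob in word_category_probs.get(word, {}).items():
            # likelihood term uses base-10 logs, mirroring the original code
            scores[category] = scores.get(category, 0.0) + count * math.log(prob, 10)
    for category in scores:
        # the original adds the prior with math.log, i.e. the natural log
        scores[category] += math.log(category_priors[category])
    return max(scores, key=scores.get) if scores else ""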