def remove_stop_words(self, frequency_list):
    stop_words = []
    with open("stop_words_list.txt") as stop_words_list:
        stop_words_data = stop_words_list.read()
        stop_words = stop_words_data.split(',')
    stemmer = Porter2Stemmer()
    result = ''
    try:
        # strip punctuation, then drop stop words and stem what remains
        for c in string.punctuation:
            frequency_list = frequency_list.replace(c, "")
        querywords = frequency_list.split()
        resultwords = [stemmer.stem(word) for word in querywords
                       if word.lower() not in stop_words]
        result = ' '.join(resultwords)
    except ValueError:
        print("no such value")
    return result
def __init__(self):
    self.stopwords = []
    self.stemmer = Porter2Stemmer()
    with open('stopwords.txt') as my_file:
        for line in my_file:
            # drop the trailing newline before stemming
            self.stopwords.append(self.stemmer.stem(line[:-1]))
    self.ignorechars = ''',:'!()'''
def near(self, index, first_term, second_term, k):
    doc_list = set()
    # stem the query terms first
    stemmer = Porter2Stemmer()
    first_term = stemmer.stem(first_term)
    second_term = stemmer.stem(second_term)

    f_postings_list = index.get_postings(first_term)
    s_postings_list = index.get_postings(second_term)
    i = 0
    j = 0

    # walk the two postings lists in document-id order
    while True:
        if i >= len(f_postings_list) or j >= len(s_postings_list):
            return doc_list
        if f_postings_list[i].get_document_id() == s_postings_list[j].get_document_id():
            f_pos_list = f_postings_list[i].get_positions()
            s_pos_list = s_postings_list[j].get_positions()
            # only second-term positions after the first occurrence of the first term matter
            s_pos_list = list(filter(lambda p: p > f_pos_list[0], s_pos_list))
            # the first position pair within distance k is enough to accept the document
            for second_pos in s_pos_list:
                # is any first-term position at most k positions before second_pos?
                matches = [0 < second_pos - first_pos <= k for first_pos in f_pos_list]
                if any(matches):
                    doc_list.add(f_postings_list[i].get_document_id())
                    break
            i += 1
            j += 1
        else:
            # advance the pointer whose current document id is smaller
            if f_postings_list[i].get_document_id() < s_postings_list[j].get_document_id():
                i += 1
            else:
                j += 1
def write_index_to_disk(self, index):
    """
    Write the given index dictionary to disk as a variable-byte-encoded binary
    file, recording each term's byte offset in a positions database.
    (Slow: one small write per encoded byte.)
    :param index: dictionary mapping terms to postings lists
    :return:
    """
    stem = Porter2Stemmer()
    vb = vbe()
    position_term_db = position_db('/Users/Cemo/Documents/cecs429/search_engine/DB/term_positions_federalists.db')
    position_term_db.create_table()
    current_index = index
    sorted_key_list = sorted(index)
    index_binary_file = open('index_federalists.bin', 'wb')
    for key in sorted_key_list:
        if not key:
            continue
        # remember where this term's postings start in the binary file
        position_term_db.add_term(stem.stem(key.lower()), index_binary_file.tell())
        disk_write_list = []
        # document frequency
        df = len(current_index[key])
        for number in vb.encode_number(df):
            index_binary_file.write(pack(">B", number))
        postings = current_index[key]
        for i in range(len(current_index[key])):
            # document ids are written as gaps
            if i == 0:
                doc_id = postings[i].get_document_id()
            else:
                doc_id = postings[i].get_document_id() - postings[i - 1].get_document_id()
            disk_write_list.append(doc_id)
            for number in vb.encode_number(doc_id):
                index_binary_file.write(pack(">B", number))
            # term frequency, then the positions (also gap-encoded)
            tf = postings[i].positions_list
            disk_write_list.append(len(tf))
            for number in vb.encode_number(len(tf)):
                index_binary_file.write(pack(">B", number))
            for j in range(len(tf)):
                if j == 0:
                    disk_write_list.append(tf[j])
                    for number in vb.encode_number(tf[j]):
                        index_binary_file.write(pack(">B", number))
                else:
                    disk_write_list.append(tf[j] - tf[j - 1])
                    for number in vb.encode_number(tf[j] - tf[j - 1]):
                        index_binary_file.write(pack(">B", number))
    position_term_db.close_connection_commit()
    index_binary_file.close()
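# write_index_to_disk() relies on a variable-byte encoder (vbe.encode_number)
# that is not shown in this snippet. For reference, a minimal sketch of classic
# variable-byte encoding (7 data bits per byte, high bit set on the final byte);
# this is an illustrative stand-in, not the project's actual vbe class.
def vb_encode_number(n):
    # collect 7-bit chunks, most significant first
    chunks = []
    while True:
        chunks.insert(0, n % 128)
        if n < 128:
            break
        n //= 128
    chunks[-1] += 128  # mark the last (least-significant) byte as the terminator
    return chunks

# Example: vb_encode_number(214577) == [13, 12, 177]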
def stop_stem(self, data):
    stop = open(
        "/home/lalit/Desktop/IR_LAB/Final_Index/Eng_code/stop_eng.txt"
    ).read().split("\n")
    stemmer = Porter2Stemmer()
    stemwords = []
    for word in data:
        if word not in stop:
            stemwords.append(stemmer.stem(word))
    return list(stemwords)
def stem(self, list_of_words):
    # returns a list of stemmed, lower-cased words
    stemmed_list = []
    stemmer = Porter2Stemmer()
    for word in list_of_words:
        stemmed_word = stemmer.stem(word.lower())
        stemmed_list.append(stemmed_word)
    return stemmed_list
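# A quick, standalone check of the porter2stemmer package that stem() wraps
# (assumes `pip install porter2stemmer`); the expected stems are illustrative.
from porter2stemmer import Porter2Stemmer

if __name__ == '__main__':
    stemmer = Porter2Stemmer()
    for word in ['Cats', 'Running', 'Jumped']:
        print(word, stemmer.stem(word.lower()))  # -> cat, run, jump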
def remove_stop_words(self, frequency_list):
    stop_words = []
    #stop_words = get_stop_words('en')
    with open("stop_words_list.txt") as stop_words_list:
        #stop_words = [line.split(',') for line in stop_words_list if line.strip()]
        stop_words_data = stop_words_list.read()
        stop_words = stop_words_data.split(',')
    temp_list = []
    stemmer = Porter2Stemmer()
    result = ''
    try:
        '''for key in frequency_list:
            if key.lower() not in stop_words:
                #key = self.remove_custom_stop_words(key)
                temp_list.append([key])
            else:
                print("Removing:" + str(key))'''
        # strip punctuation from the raw text
        for c in string.punctuation:
            frequency_list = frequency_list.replace(c, "")
        querywords = frequency_list.split()
        # keep non-stop-words and stem them
        resultwords = [
            stemmer.stem(word) for word in querywords
            if word.lower() not in stop_words
        ]
        result = ' '.join(resultwords)
    except ValueError:
        print("no such value")
    return result
def index_file(file_name, documentID):
    stemmer = Porter2Stemmer()
    punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    with open(file_name) as json_file:
        article_data = json.load(json_file)
    body = article_data['body'].lower().translate(punctuation).split(' ')
    body = list(filter(lambda w: w != '', map(lambda s: s.strip(), body)))
    term_positions = find_positions(body)
    for key in term_positions:
        index.add_term(key, documentID, term_positions[key])
        # also index the stemmed form if it differs and is not already present
        stemmed_term = stemmer.stem(key)
        if stemmed_term != key and stemmed_term not in index.m_index:
            index.add_term(stemmed_term, documentID, term_positions[key])
def link_promise(self, link):
    promise = 0.0
    stemmer = Porter2Stemmer()
    if link is None:
        return promise
    try:
        # get the stemmed terms in the link
        link_terms = re.findall(r"\w+", link.lower())
        link_terms = [stemmer.stem(term) for term in link_terms]
    except:
        print("link error")
        return promise
    # count link terms and stem the query terms
    link_terms_count = collections.Counter(link_terms)
    self.query_terms = [stemmer.stem(term) for term in self.query_terms]
    # each query-term hit in the link adds 0.1 * its count to the promise
    for term in self.query_terms:
        if term in link_terms_count:
            promise = promise + 0.1 * link_terms_count[term]
    return promise
def wild(word_input):
    kg = kgram_index()
    w = wildcard()
    stemmer = Porter2Stemmer()
    ktokens = []
    wildcard_tokens = w.wildcard_parser(word_input)

    # build k-grams (k up to 3) for every wildcard-free token
    for token in wildcard_tokens:
        k = 3 if len(token) > 3 else len(token)
        ktokens.extend(kg.create_kgram(token, k))

    # remove '$' from tokens
    ktokens[:] = [x for x in ktokens if x != '$']

    # intersect the candidate vocabulary sets for all k-grams
    candidate_lists = []
    for token in ktokens:
        if token in vocab:
            candidate_lists.append(vocab[token])
            print(token, list(vocab[token]), len(vocab))
    intersected_list = list(set(candidate_lists[0].intersection(*candidate_lists[1:])))

    # stem the surviving vocabulary terms and collect their postings
    stemmed_terms = [stemmer.stem(t) for t in intersected_list]
    postings_lists = [index.get_index()[t] for t in stemmed_terms]
    doc_list = []
    for p_list in postings_lists:
        for post in p_list:
            doc_list.append(post.get_document_id())
    # return the list of docs in which the wildcard term was found
    return doc_list
def preprocess(raw_text):
    lang = detect(raw_text)

    # 1. keep only words
    letters_only_text = raw_text

    # 2. convert to lower case and split
    words = letters_only_text.lower().split()

    # 3. remove \n
    break_free_words = words  # [word.rstrip("\n") for word in words]

    # 4. lemmatize (Russian) or stem (everything else)
    lemmatized_words = []
    print(break_free_words)
    if lang == 'ru':
        m = Mystem()
        for word in break_free_words:
            a = m.lemmatize(word)
            lemmatized_words.append(a[0])
    else:
        stemmer = Porter2Stemmer()
        lemmatized_words = [stemmer.stem(word) for word in break_free_words]

    final = []
    for i in lemmatized_words:
        final.append(i)
    return final
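# Illustrative call to preprocess() above; requires the langdetect, pymystem3
# and porter2stemmer packages. The printed result is indicative only.
if __name__ == '__main__':
    print(preprocess("Cats are running in the garden"))
    # English branch: lower-cased, whitespace-split, stemmed tokens,
    # e.g. ['cat', 'are', 'run', 'in', 'the', 'garden']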
def Cleaner(text):
    # import/download relevant packages
    import string
    import nltk
    nltk.download('punkt')
    nltk.download('stopwords')

    # split into words
    from nltk.tokenize import word_tokenize
    tokens = word_tokenize(text)

    # convert to lower case
    tokens = [w.lower() for w in tokens]

    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]

    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]

    # filter out stop words
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]

    # stemming of words
    from porter2stemmer import Porter2Stemmer
    stemmer = Porter2Stemmer()
    stemmed = [stemmer.stem(word) for word in words]

    # final output
    global clean_text
    clean_text = stemmed
    return clean_text
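# Minimal usage sketch for Cleaner(); the sample sentence and the expected
# output shape are illustrative only.
if __name__ == '__main__':
    sample = "The cats were running quickly through the old gardens."
    print(Cleaner(sample))
    # lowercase, punctuation-free, stop-word-free, stemmed tokens,
    # e.g. ['cat', 'run', 'quick', 'old', 'garden']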
class IndexModule:
    data_list = None
    data_n = None
    stop_words = set()  # set of stop words
    stemmer = Porter2Stemmer()  # init porter stemmer
    config_path = None
    config = None
    conn = None

    def __init__(self):
        """
        init config, stop words, data list
        """
        self.config_path = 'config.ini'
        self.config = configparser.ConfigParser()
        self.config.read(self.config_path, 'utf-8')
        with open(self.config['DEFAULT']['STOPWORDS_PATH'], encoding='utf-8') as f:
            self.stop_words = set(f.read().split())
        self.conn = sqlite3.connect(self.config['DEFAULT']['SE_DB_PATH'])
        self.conn.row_factory = sqlite3.Row
        c = self.conn.cursor()
        self.data_list = c.execute('select * from recipes').fetchall()
        self.data_n = len(self.data_list)
        # write DATA_N to config
        self.config.set('DEFAULT', 'DATA_N', str(self.data_n))
        with open(self.config_path, 'w', encoding='utf-8') as f:
            self.config.write(f)

    def __del__(self):
        """
        close the database
        :return:
        """
        self.conn.close()

    def write_index_to_db(self, index, table_name):
        """
        write inverted index to db
        index in form of {term: [df, [posting, ...]], ...}
        :param table_name:
        :param index:
        :return:
        """
        conn = sqlite3.connect(self.config['DEFAULT']['SE_DB_PATH'])
        c = conn.cursor()
        c.execute('drop table if exists %s' % table_name)
        c.execute(
            'create table %s (term text primary key, df integer, postings text)'
            % table_name)
        for key, value in index.items():
            posting_list = '\n'.join(map(str, value[1]))
            values = (key, value[0], posting_list)
            c.execute('insert into %s values (?, ?, ?)' % table_name, values)
        conn.commit()
        conn.close()

    def data_cleanup_tf(self, data):
        """
        clean data and construct tf dictionary
        :param data:
        :return: length of data and tf dictionary
        """
        tf_dict = {}  # {term: tf, ...}
        n = 0  # length of data
        terms = data.lower().split()  # lower the data and split
        for term in terms:
            # filter stop words having quotation marks
            # filter sites in a simple way
            if (term not in self.stop_words) and ('http' not in term) and (
                    'www' not in term):
                term = re.sub(r'[^a-z]', '', term)  # remove non-alphabetic letters
                # filter stop words again and blank term
                if (term not in self.stop_words) and (len(term) != 0):
                    term = self.stemmer.stem(term)  # stemming
                    n += 1
                    if term in tf_dict:
                        tf_dict[term] += 1
                    else:
                        tf_dict[term] = 1
        return n, tf_dict

    def construct_index_name_desc_ing(self):
        """
        construct inverted index with name, description and ingredients
        :return:
        """
        inverted_index = {}  # form: {term: [df, [posting, ...]], ...}
        AVG_LEN = 0  # average length for name and description
        for recipe in self.data_list:
            rid = recipe['id']
            name = recipe['name']
            description = recipe['description']
            ingredients = recipe['ingredients']
            length, term_tf = self.data_cleanup_tf(
                name + ' ' + description + ' ' + ingredients)
            AVG_LEN += length
            for term, tf in term_tf.items():
                posting = Posting(rid, tf, length)
                if term in inverted_index:
                    inverted_index[term][0] += 1  # df++
                    inverted_index[term][1].append(posting)  # add posting to list
                else:
                    inverted_index[term] = [1, [posting]]  # [df, [posting]]
        AVG_LEN /= self.data_n
        # print(len(inverted_index), inverted_index)
        # write AVG_LEN to config
        self.config.set('DEFAULT', 'AVG_LEN', str(AVG_LEN))
        with open(self.config_path, 'w', encoding='utf-8') as f:
            self.config.write(f)
        self.write_index_to_db(inverted_index, 'index_name_desc_ing')

    def construct_index_name(self):
        """
        construct inverted index with name
        :return:
        """
        inverted_index = {}  # form: {term: [df, [posting, ...]], ...}
        for recipe in self.data_list:
            rid = recipe['id']
            name = recipe['name']
            length, term_tf = self.data_cleanup_tf(name)
            for term, tf in term_tf.items():
                posting = Posting(rid, tf, length)
                if term in inverted_index:
                    inverted_index[term][0] += 1  # df++
                    inverted_index[term][1].append(posting)  # add posting to list
                else:
                    inverted_index[term] = [1, [posting]]  # [df, [posting]]
        # print(len(inverted_index), inverted_index)
        self.write_index_to_db(inverted_index, 'index_name')

    def construct_index_ingredient(self):
        """
        construct inverted index with ingredient
        :return:
        """
        inverted_index = {}  # form: {term: [df, [posting, ...]], ...}
        for recipe in self.data_list:
            rid = recipe['id']
            ing = recipe['ingredients']
            length, term_tf = self.data_cleanup_tf(ing)
            for term, tf in term_tf.items():
                posting = Posting(rid, tf, length)
                if term in inverted_index:
                    inverted_index[term][0] += 1  # df++
                    inverted_index[term][1].append(posting)  # add posting to list
                else:
                    inverted_index[term] = [1, [posting]]  # [df, [posting]]
        self.write_index_to_db(inverted_index, 'index_ingredient')
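# The indexing code above assumes a Posting object built from (rid, tf, length)
# and serialised with str() into the tab-separated layout that the query side
# (see Index_name.result_by_tfidf further below) splits on '\t'. A minimal
# stand-in consistent with that usage (an assumption, not the project's class):
class Posting:
    def __init__(self, rid, tf, length):
        self.rid = rid        # recipe id
        self.tf = tf          # term frequency in this recipe
        self.length = length  # cleaned length of the recipe text

    def __str__(self):
        return '{}\t{}\t{}'.format(self.rid, self.tf, self.length)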
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from textblob import TextBlob as tb
import os
from os import walk
import networkx
import scipy.stats
import statistics
from statistics import mean, stdev
import numpy as np
import collections
import time
from nltk import ngrams
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import *
from porter2stemmer import Porter2Stemmer  # Porter2Stemmer comes from the porter2stemmer package

ps = Porter2Stemmer()
stop_words = set(stopwords.words('english'))


class Node:
    def __init__(self, author_name, count):  # constructor
        self.AuthorName = author_name
        self.Count = count
        self.Impact = 0
def index_file(directory, file_name, documentID, index):
    stemmer = Porter2Stemmer()
    # Dealing with punctuation
    p = dict.fromkeys(string.punctuation)
    p.pop('-')  # we need to deal with hyphens separately
    weight_map = {}
    try:
        with open(directory + file_name) as txt_file:
            # Normalize the vocab: lower-case, drop newlines and non-alphanumerics
            body = txt_file.read().replace('\n', '').lower()
            body = re.sub(r'[^A-Za-z0-9#]+', ' ', body)
            body = body.split(' ')
            # remove empty strings and lone hyphens
            body = list(filter(lambda t: t != '' and t != '-', body))
            position = 0
            for term in body:
                # take care of hyphenated words
                if '-' in term:
                    unhyphenated_word = term.replace('-', '')
                    index.add_term(stemmer.stem(unhyphenated_word), documentID, position)
                    hyphened_tokens = term.split('-')
                    for t in hyphened_tokens:
                        all_docs_index.add_term(stemmer.stem(t), documentID, position)
                else:
                    index.add_term(stemmer.stem(term), documentID, position)
                position += 1
                if term not in weight_map:
                    weight_map[term] = 1
                else:
                    weight_map[term] = weight_map[term] + 1
    except FileNotFoundError as e:
        print(e)

    # given a document, doc_total_tf returns a map of that doc's term frequencies
    doc_total_tf[file_name] = weight_map
    score_map = {}
    wdt = 0
    # Gets the Wdt's of the terms in the file
    for tf in weight_map:
        score = pow(1 + log(weight_map[tf]), 2)
        score_map[tf] = score
        wdt += score ** 2
    Ld = sqrt(wdt)
    length = 0
    for tf in score_map:
        score_map[tf] = score_map[tf] / Ld
        length += score_map[tf] ** 2
    doc_wdt[file_name] = score_map

    # Things to turn in for Neal, it's just the easiest place to put this
    if file_name == 'paper_52.txt':
        print('First 30 components of document 52')
        get_first_thirty(score_map, True)
class crToPG(object): stemmer = Porter2Stemmer() #remove stop words def remove_stop_words(self, frequency_list): stop_words = [] #stop_words = get_stop_words('en') with open("stop_words_list.txt") as stop_words_list: #stop_words = [line.split(',') for line in stop_words_list if line.strip()] stop_words_data = stop_words_list.read() stop_words = stop_words_data.split(',') #frequency_list=frequency_list.split(" ") temp_list = [] stemmer = Porter2Stemmer() #print(frequency_list) #stop_words.append(stop_words_data) #print(stop_words) try: '''for key in frequency_list: #print(key) if key.lower() not in stop_words: #print (key) #key = self.remove_custom_stop_words(key) temp_list.append([key]) else: print("Removing:"+str(key))''' querywords = frequency_list.split() for stemwords in querywords: print(stemwords, stemmer.stem(stemwords)) resultwords = [ word for word in querywords if word.lower() not in stop_words ] result = ' '.join(resultwords) #print result except ValueError: print "no such value" #print(result) return result def remove_custom_stop_words(self, word_list): #stop_words_lst = ['yo', 'so', 'well', 'um', 'a', 'the', 'you know', 'i mean'] with open("stop_words_list.txt") as stop_words_list: stop_words_list = [ line.split('\n') for line in stop_words_list if line.strip() ] '''for word in stop_words_list: pattern = r'\b'+word[0]+r'\b' word_list = re.sub(pattern, '', word_list)''' try: if word_list not in stop_words_list: return word_list else: print("Match found:" + str(word_list)) except ValueError: print "no such value" def ingest(self, crfile, pagestack, billstack, speechstack, speechstack1): """ Break a crdoc into three parts Pass the appropriate rows for each part to the right stack for a bulk insert. """ #print(crfile) page_row = OrderedDict([('pageid', crfile['id']), ('title', rd(crfile['doc_title'])), ('chamber', crfile['header']['chamber']), ('extension', crfile['header']['extension']), ('cr_day', crfile['header']['day']), ('cr_month', crfile['header']['month']), ('cr_year', crfile['header']['year']), ('num', crfile['header']['num']), ('vol', crfile['header']['vol']), ('wkday', crfile['header']['wkday'])]) # Add the "page" level to the page stack first pagestack.add(page_row) bills = [] if 'related_bills' in list(crfile.keys()): for bill in crfile['related_bills']: bill_row = OrderedDict([('congress', bill['congress']), ('context', bill['context']), ('bill_type', bill['type']), ('bill_no', bill['number']), ('pageid', crfile['id'])]) bills.append(bill_row) # Bills for the bill god! 
billstack.add(bills) #speeches = [] ''' for speech in crfile['content']: if speech['kind'] == 'speech': speechid = crfile['id'] + '-' + str(speech['turn']) test = 'test string' speech_row = OrderedDict([('speechid',speechid), ('speaker',speech['speaker']), ('speaker_bioguide',speech['speaker_bioguide']), ('pageid',crfile['id']), ('text',rd(speech['text'])), ('turn',speech['turn']), ('party',test) ]) # Gotta get rid of delimiter char speeches.append(speech_row)''' speeches_republican = [] speeches_democratic = [] #speech_row_D =[] #speech_row_R =[] democratic_data_output = '' republican_data_output = '' for speech in crfile['content']: if speech['kind'] == 'speech': #speechid = crfile['id'] + '-' + str(speech['turn']) #test = 'anannya' #print(speech) import json #print(speech['speaker_bioguide']) #print(rd(speech['text'])) v = str(speech['speaker_bioguide']) + "||" + str( rd(speech['text'])) '''with open('speeches_test','a+') as out_json: json.dump(v,out_json)''' if speech['speaker_bioguide']: keybioguideid = speech['speaker_bioguide'] outpath = os.path.join('', 'json', keybioguideid + '.json') #print(outpath) #outpath = 'json\\'+keybioguideid+'.json' with open(outpath) as json_data: d = json.load(json_data) #print(rd(speech['text'])) #print('*****************************************************************************************************************************************') #print(speech_remove_sort_words) if d['party'] == 'D': speech_row_D = [] speech_remove_stop_words = [] speech_remove_stop_words = self.remove_stop_words( rd(speech['text'])) #print(d['party']) current_speaker_data = '\n\n' + speech_remove_stop_words democratic_data_output = democratic_data_output + current_speaker_data + '\n' speech_row_D = OrderedDict([ #('speechid',speechid), ('affiliation', 'Affiliation:' + d['party']), ('speaker', speech['speaker']), #('speaker_bioguide',speech['speaker_bioguide']), #('pageid',crfile['id']), ('text', speech_remove_stop_words), #('turn',speech['turn']) ]) speeches_democratic.append(speech_row_D) '''if len(speech_remove_stop_words): #print(speech_remove_stop_words) speeches_democratic.append(speech_row_D) else: pass #print(str(keybioguideid) + "D")''' elif d['party'] == 'R': speech_row_D = [] speech_remove_stop_words = [] speech_remove_stop_words = self.remove_stop_words( rd(speech['text'])) #print(speech_remove_stop_words,stemmer.stem(speech_remove_stop_words)) current_speaker_data = '\n\n' + speech_remove_stop_words republican_data_output = republican_data_output + current_speaker_data + '\n' speech_row_R = OrderedDict([ #('speechid',speechid), ('affiliation', 'Affiliation:' + d['party']), ('speaker', speech['speaker']), #('speaker_bioguide',speech['speaker_bioguide']), #('pageid',crfile['id']), #('text',''), ('text', speech_remove_stop_words), #('turn',speech['turn']) ]) speeches_republican.append(speech_row_R) #print(str(keybioguideid) + "R") '''if len(speech_remove_stop_words): #print(speech_remove_stop_words) speeches_republican.append(speech_row_R) else: pass''' else: keybioguideid = 'dummy' #print(str(keybioguideid)) #pr.find_people(pr(),'','') # SPEECHES FOR THE SPEECH THRONE #print(speeches_republican) #print(speeches_democratic) #print(democratic_data_output) speechstack.add(speeches_republican) speechstack1.add(speeches_democratic) import json #print(democratic_data_output) with open('democratic_speeches.txt', 'a+') as out_json: out_json.write(democratic_data_output) with open('republican_speeches.txt', 'a+') as out_json: out_json.write(democratic_data_output) def 
find_people(self): mbrs = self.doc_ref.find_all('congmember') if mbrs: for mbr in mbrs: self.speakers[mbr.find('name', {'type':'parsed'}).string] = \ self.people_helper(mbr) '''def people_helper(self,tagobject): output_dict = {} if 'bioguideid' in tagobject.attrs: output_dict['bioguideid'] = tagobject['bioguideid'] elif 'bioGuideId' in tagobject.attrs: output_dict['bioguideid'] = tagobject['bioGuideId'] else: output_dict['bioguideid'] = 'None' for key in ['chamber','congress','party','state','role']: if key in tagobject.attrs: output_dict[key] = tagobject[key] else: output_dict[key] = 'None' try: output_dict['name_full'] = tagobject.find('name',{'type':'authority-fnf'}).string except: output_dict['name_full'] = 'None' #print(output_dict) return output_dict # Flow control for metadata generation def gen_file_metadata(self): # Sometimes the searchtitle has semicolons in it so .split(';') is a nogo temp_ref = self.cr_dir.mods.find('accessid', text=self.access_path) if temp_ref is None: raise RuntimeError("{} doesn't have accessid tag".format(self.access_path)) self.doc_ref = temp_ref.parent matchobj = re.match(self.re_vol, self.doc_ref.searchtitle.string) if matchobj: self.doc_title, self.cr_vol, self.cr_num = matchobj.group('title','vol','num') else: logging.warn('{0} yields no title, vol, num'.format( self.access_path)) self.doc_title, self.cr_vol, self.cr_num = \ 'None','Unknown','Unknown' self.find_people() self.find_related_bills() self.find_related_laws() self.find_related_usc() self.find_related_statute() self.date_from_entry() self.chamber = self.doc_ref.granuleclass.string self.re_newspeaker = self.make_re_newspeaker() self.item_types['speech']['patterns'] = [self.re_newspeaker]''' def __init__(self, start, **kwargs): """ BE SURE TO INCLUDE do_mode='yield' in kwargs! This object handles flow control for new data entering a Postgres database using congressionalrecord2s data model. It breaks the incoming Python dictionaries into three stacks of rows, one for each table in this data model. It writes the results to each of three flatfiles suitable for a bulk update through COPY. This is the way to minimize the number of transactions to the database, which we want. 
""" kwargs['do_mode'] = 'yield' if 'csvpath' in kwargs: pass else: kwargs['csvpath'] = 'dbfiles' pagepath, billpath, speechpath, speechpath1 = [ os.path.join(kwargs['csvpath'], filename) for filename in ['pages.csv', 'bills.csv', 'speeches_R.csv', 'speeches_D.csv'] ] self.downloader = dl(start, **kwargs) self.doc_ref = '' memberlistfinal = [] #object1 = congressionalrecord.fdsys.cr_parser.ParseCRDir() #print(object1) #self.cr_dir = '<congressionalrecord.fdsys.cr_parser.ParseCRDir object at 0x7f0c7c88cb90>' #self.cr_dir=cr_dir #self.gen_file_metadata() #print(pr.find_people(pr(self,''))) #self.find_people() #print('anannya'+str(pr.memberlist)) #print(pr('/home/anannyadas/Desktop/congress/congressional-record-master/congressionalrecord/pg_run/fdsys')) self.page_fields = [ 'pageid', 'title', 'chamber', 'extension', 'cr_day', 'cr_month', 'cr_year', 'num', 'vol', 'pages', 'wkday' ] self.bill_fields = [ 'congress', 'context', 'bill_type', 'bill_no', 'pageid' ] #self.speech_fields = ['speechid','affiliation','speaker','speaker_bioguide','pageid','text','turn'] self.speech_fields = ['affiliation', 'speaker', 'text'] pagestack = crPages(pagepath, self.page_fields) billstack = crBills(billpath, self.bill_fields) speechstack = crSpeeches(speechpath, self.speech_fields) speechstack1 = crSpeeches(speechpath1, self.speech_fields) for crfile in self.downloader.yielded: #print(crfile) doc = crfile.crdoc self.ingest(doc, pagestack, billstack, speechstack, speechstack1) # pagestack.write() # billstack.write() speechstack.write() speechstack1.write()
class ParseCRFile(object): # Some regex re_time = r'^CREC-(?P<year>[0-9]{4})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})-.*' re_vol = r'^(?P<title>.*); Congressional Record Vol. (?P<vol>[0-9]+), No. (?P<num>[0-9]+)$' re_vol_file = r'^\[Congressional Record Volume (?P<vol>[0-9]+), Number (?P<num>[0-9]+)'\ + r' \((?P<wkday>[A-Za-z]+), (?P<month>[A-Za-z]+) (?P<day>[0-9]+), (?P<year>[0-9]{4})\)\]' re_chamber = r'\[(?P<chamber>[A-Za-z\s]+)\]' re_pages = r'\[Page[s]? (?P<pages>[\w\-]+)\]' re_trail = r'From the Congressional Record Online'\ + r' through the Government (Publishing|Printing) Office \[www.gpo.gov\]$' re_rollcall = r'\[Roll(call)?( Vote)? No. \d+.*\]' re_recorderstart = (r'^\s+(?P<start>' + r'(The (assistant )?legislative clerk read as follows)' + r'|(The nomination considered and confirmed is as follows)' + r'|(The (assistant )?legislative clerk)' + r'|(The nomination was confirmed)' + r'|(There being no objection, )' + r'|(The resolution .*?was agreed to.)' + r'|(The preamble was agreed to.)' + r'|(The resolution .*?reads as follows)' + r'|(The assistant editor .*?proceeded to call the roll)' + r'|(The bill clerk proceeded to call the roll.)' + r'|(The bill clerk called the roll.)' + r'|(The motion was agreed to.)' #+ r'|(The Clerk read the resolution, as follows:)' + r'|(The Clerk read (the resolution, )as follows:)' + r'|(The resolution(, with its preamble,)? reads as follows:)' + r'|(The amend(ment|ed).*?(is)? as follows:)' + r'|(Amendment No\. \d+.*?is as follows:)' + r'|(The yeas and nays resulted.*?, as follows:)' + r'|(The yeas and nays were ordered)' + r'|(The result was announced.*?, as follows:)' + r'|(The .*?editor of the Daily Digest)' + r'|(The (assistant )?bill clerk read as follows:)' + r'|(The .*?read as follows:)' + r'|(The text of the.*?is as follows)' + r'|(amended( to read)? as follows:)' + r'|(The material (previously )?referred to (by.*?)?is as follows:)' + r'|(There was no objection)' + r'|(The amendment.*?was agreed to)' + r'|(The motion to table was .*)' + r'|(The question was taken(;|.))' + r'|(The following bills and joint resolutions were introduced.*)' + r'|(The vote was taken by electronic device)' + r'|(A recorded vote was ordered)' #+ r'|()' + r').*') # anchored at the end of the line re_recorderend = (r'(' + r'(read as follows:)' + r'|(the Record, as follows:)' + r'|(ordered to lie on the table; as follows:)' + r'|(resolutions as follows:)' + r')$') # sometimes the recorder says something that is not unique to them but # which, in the right context, we take to indicate a recorder comment. re_recorder_fuzzy = (r'^\s+(?P<start>' + r'(Pending:)' + r'|(By M(r|s|rs)\. .* \(for .*)' #+ r'|()' + r').*') # NCJ's broader version below, tested on one day of the record. # works, honest re_recorder_ncj = (r'^\s+(?P<start>' + r'(Pending:)' + r'|(By M(r|rs|s|iss)[\.]? [a-zA-Z]+))' ) re_clerk = r'^\s+(?P<start>The Clerk (read|designated))' re_allcaps = r'^ \s*(?!([_=]+|-{3,}))(?P<title>([A-Z]+[^a-z]+))$' re_linebreak = r'\s+([_=]+|-{5,})(NOTE|END NOTE)?([_=]+|-{5,})*\s*' re_excerpt = r'\s+(_{3,4})' re_newpage = r'\s*\[\[Page \w+\]\]' re_timestamp = r'\s+\{time\}\s+\d{4}' # Metadata-making functions def title_id(self): id_num = self.num_titles self.num_titles += 1 return id_num def make_re_newspeaker(self): speaker_list = '|'.join([mbr for mbr in list(self.speakers.keys()) \ if self.speakers[mbr]['role'] == 'SPEAKING']) if len(speaker_list) > 0: re_speakers = r'^(\s{1,2}|<bullet>)(?P<name>((' + speaker_list + ')|(((Mr)|(Ms)|(Mrs)|(Miss))\. 
(([-A-Z\'])(\s)?)+( of [A-Z][a-z]+)?)|(((The ((VICE|ACTING|Acting) )?(PRESIDENT|SPEAKER|CHAIR(MAN)?)( pro tempore)?)|(The PRESIDING OFFICER)|(The CLERK)|(The CHIEF JUSTICE)|(The VICE PRESIDENT)|(Mr\. Counsel [A-Z]+))( \([A-Za-z.\- ]+\))?)))\.' else: re_speakers = r'^(\s{1,2}|<bullet>)(?P<name>((((Mr)|(Ms)|(Mrs)|(Miss))\. (([-A-Z\'])(\s)?)+( of [A-Z][a-z]+)?)|((The ((VICE|ACTING|Acting) )?(PRESIDENT|SPEAKER|CHAIR(MAN)?)( pro tempore)?)|(The PRESIDING OFFICER)|(The CLERK)|(The CHIEF JUSTICE)|(The VICE PRESIDENT)|(Mr\. Counsel [A-Z]+))( \([A-Za-z.\- ]+\))?))\.' return re_speakers def people_helper(self,tagobject): output_dict = {} if 'bioguideid' in tagobject.attrs: output_dict['bioguideid'] = tagobject['bioguideid'] elif 'bioGuideId' in tagobject.attrs: output_dict['bioguideid'] = tagobject['bioGuideId'] else: output_dict['bioguideid'] = 'None' for key in ['chamber','congress','party','state','role']: if key in tagobject.attrs: output_dict[key] = tagobject[key] else: output_dict[key] = 'None' try: output_dict['name_full'] = tagobject.find('name',{'type':'authority-fnf'}).string except: output_dict['name_full'] = 'None' #print(output_dict) #cr.memberlistfinal.append(output_dict) ''' if 'json' not in os.listdir(outpath): os.mkdir(os.path.join(outpath,'json'))''' with open('json/'+output_dict['bioguideid']+'.json','w+') as out_json: json.dump(output_dict,out_json) return output_dict def find_people(self): mbrs = self.doc_ref.find_all('congmember') memberlist = mbrs #print(memberlist) if mbrs: for mbr in mbrs: self.speakers[mbr.find('name', {'type':'parsed'}).string] = \ self.people_helper(mbr) def find_related_bills(self): related_bills = self.doc_ref.find_all('bill') if len(related_bills) > 0: self.crdoc['related_bills'] = \ [bill.attrs for bill in related_bills] def find_related_laws(self): related_laws = self.doc_ref.find_all('law') if len(related_laws) > 0: self.crdoc['related_laws'] = \ [law.attrs for law in related_laws] def find_related_usc(self): related_usc = self.doc_ref.find_all('uscode') if len(related_usc) > 0: self.crdoc['related_usc'] = list( itertools.chain.from_iterable( [[dict([('title',usc['title'])] + list(sec.attrs.items())) for sec in usc.find_all('section')] for usc in related_usc] ) ) def find_related_statute(self): related_statute = self.doc_ref.find_all('statuteatlarge') if len(related_statute) > 0: self.crdoc['related_statute'] = list( itertools.chain.from_iterable( [[dict([('volume',st['volume'])] + list(pg.attrs.items())) for pg in st.find_all('pages')] for st in related_statute] ) ) def date_from_entry(self): year, month, day = re.match(self.re_time,self.access_path).group('year','month','day') if self.doc_ref.time: from_hr,from_min,from_sec = self.doc_ref.time['from'].split(':') to_hr,to_min,to_sec = self.doc_ref.time['to'].split(':') try: self.doc_date = datetime(int(year),int(month),int(day)) self.doc_start_time = datetime(int(year),int(month),int(day),\ int(from_hr),int(from_min),int(from_sec)) self.doc_stop_time = datetime(int(year),int(month),int(day),\ int(to_hr),int(to_min),int(to_sec)) self.doc_duration = self.doc_stop_time - self.doc_start_time except: logging.info('Could not extract a document timestamp.') # Flow control for metadata generation def gen_file_metadata(self): # Sometimes the searchtitle has semicolons in it so .split(';') is a nogo temp_ref = self.cr_dir.mods.find('accessid', text=self.access_path) #print(type(self.cr_dir)) if temp_ref is None: raise RuntimeError("{} doesn't have accessid tag".format(self.access_path)) self.doc_ref = 
temp_ref.parent matchobj = re.match(self.re_vol, self.doc_ref.searchtitle.string) if matchobj: self.doc_title, self.cr_vol, self.cr_num = matchobj.group('title','vol','num') else: logging.warn('{0} yields no title, vol, num'.format( self.access_path)) self.doc_title, self.cr_vol, self.cr_num = \ 'None','Unknown','Unknown' self.find_people() self.find_related_bills() self.find_related_laws() self.find_related_usc() self.find_related_statute() self.date_from_entry() self.chamber = self.doc_ref.granuleclass.string self.re_newspeaker = self.make_re_newspeaker() self.item_types['speech']['patterns'] = [self.re_newspeaker] # That's it for metadata. Below deals with content. def read_htm_file(self): """ This function updates a self.cur_line attribute. So now for each call to the iterator there are two pointers to the next line - one for the function, and one for the object. The purpose of the attribute is to give each parsing function a "starting position" so that the handshake between functions is easier. Now the current (or last) line is tracked in only one place and the same way by all object methods. """ self.lines_remaining = True with open(self.filepath, 'r') as htm_file: htm_lines = htm_file.read() htm_text = BeautifulSoup(htm_lines,"lxml") text = htm_text.pre.text.split('\n') for line in text: self.cur_line = line yield line self.lines_remaining = False def get_header(self): """ Only after I wrote this did I realize how bad things can go when you call next() on an iterator instead of treating it as a list. This code works, though. """ header_in = next(self.the_text) if header_in == u'': header_in = next(self.the_text) match = re.match(self.re_vol_file, header_in) if match: vol, num, wkday, month, day, year = match.group( \ 'vol','num','wkday','month','day','year') else: return False header_in = next(self.the_text) match = re.match(self.re_chamber, header_in) if match: if match.group('chamber') == 'Extensions of Remarks': chamber = 'House' extensions = True else: chamber = match.group('chamber') extensions = False else: return False header_in = next(self.the_text) match = re.match(self.re_pages, header_in) if match: pages = match.group('pages') else: return False header_in = next(self.the_text) match = re.match(self.re_trail, header_in) if match: pass else: return False return vol, num, wkday, month, day, year, chamber, pages, extensions def write_header(self): self.crdoc['id'] = self.access_path header = self.get_header() if header: self.crdoc['header'] = {'vol':header[0],'num':header[1],\ 'wkday':header[2],'month':header[3],'day':header[4],\ 'year':header[5],'chamber':header[6],'pages':header[7],\ 'extension':header[8]} self.crdoc['doc_title'] = self.doc_title def get_title(self): """ Throw out empty lines Parse consecutive title-matching strings into a title str Stop on the first line that isn't empty and isn't a title Return the title str if it exists. 
We pretty much assume the first title on the page applies to everything below it """ title_str = '' for line in self.the_text: if line == u'': pass else: a_match = re.match(self.re_allcaps, line) if a_match: title_str = ' '.join([title_str,a_match.group('title')]) else: break if len(title_str) > 0: return title_str.strip() else: return False def write_page(self): turn = 0 itemno = 0 title = self.get_title() the_content = [] if title: self.crdoc['title'] = title else: self.crdoc['title'] = None while self.lines_remaining: # while not re.match(self.re_allcaps,self.cur_line): try: item = crItem(self).item if item['kind'] == 'speech': item['turn'] = turn turn += 1 item['itemno'] = itemno itemno += 1 the_content.append(item) except Exception as e: logging.warn('{0}'.format(e)) break self.crdoc['content'] = the_content logging.debug('Stopped writing {0}. The last line is: {1}'.format(self.access_path,self.cur_line)) def parse(self): """ Flow control for parsing content. """ self.the_text = self.read_htm_file() self.write_header() self.write_page() """ This is a dict of line cases. In previous versions, these relations were called explicitly multiple times in multiple places. This way is more extensible and easier to track cases. Usage: If break_flow == True: <interrupt current item> If speaker_re == True: speaker = re.match(line, <pattern from patterns>). .group(<speaker_group>) else: speaker = <speaker> (ALSO -- see line 176 for how speech patterns is populated) It has to come after some of the functions because of how I want to handle special cases. """ item_types = { 'speech': {'patterns':['Mr. BOEHNER'], 'speaker_re':True, 'speaker_group':'name', 'break_flow':True, 'special_case':False }, 'recorder': {'patterns':[re_recorderstart, re_recorderend, re_recorder_ncj], 'speaker_re':False, 'speaker':'The RECORDER', 'break_flow':True, 'special_case':False }, 'clerk': {'patterns':[re_clerk], 'speaker_re':False, 'speaker':'The Clerk', 'break_flow':True, 'special_case':False }, 'linebreak': {'patterns':[re_linebreak], 'speaker_re':False, 'speaker':'None', 'break_flow':True, 'special_case':True, 'condition':'emptystr' }, 'excerpt': {'patterns':[re_excerpt], 'speaker_re':False, 'speaker':'None', 'break_flow':True, 'special_case':True, 'condition':'lastspeaker' }, 'rollcall': {'patterns':[re_rollcall], 'speaker_re':False, 'speaker':'None', 'break_flow':True, 'special_case':False }, 'metacharacters': {'patterns':[re_timestamp, re_newpage], 'speaker_re':False, 'speaker':'None', 'break_flow':False, 'special_case':False }, 'empty_line': {'patterns':[r'(^[\s]+$)'], 'speaker_re':False, 'speaker':'None', 'break_flow':False, 'special_case':False }, 'title': {'patterns':[re_allcaps], 'speaker_re':False, 'speaker':'None', 'break_flow':True, 'special_case':False, } } stemmer = Porter2Stemmer() #remove stop words def remove_stop_words(self,frequency_list): stop_words = [] with open("stop_words_list.txt") as stop_words_list: stop_words_data = stop_words_list.read() stop_words = stop_words_data.split(',') temp_list = [] stemmer = Porter2Stemmer() try: for c in string.punctuation: frequency_list= frequency_list.replace(c,"") querywords = frequency_list.split() resultwords = [stemmer.stem(word) for word in querywords if word.lower() not in stop_words] result = ' '.join(resultwords) except ValueError: print "no such value" #print(result) return result def __init__(self, abspath, cr_dir, **kwargs): # Some metadata self.crdoc = {} self.crdoc['header'] = False self.crdoc['content'] = [] self.num_titles = 0 self.speakers = 
{} self.doc_ref = '' self.doc_time = -1 self.doc_start_time = -1 self.doc_stop_time = -1 self.doc_duration = -1 self.doc_chamber = 'Unspecified' self.doc_related_bills = [] # file data self.filepath = abspath self.filedir, self.filename = os.path.split(abspath) self.cr_dir = cr_dir #print(cr_dir) self.access_path = self.filename.split('.')[0] # Generate all metadata including list of speakers self.gen_file_metadata() # Must come after speaker list generation self.item_breakers = [] self.skip_items = [] for x in list(self.item_types.values()): if x['break_flow'] == True: self.item_breakers.extend(x['patterns']) else: self.skip_items.extend(x['patterns']) # Parse the file self.parse() #print( self.crdoc['content']) for speech in self.crdoc['content']: #print("Code running") if speech['kind'] == 'speech': #print(speech['text']) if speech['speaker_bioguide']: keybioguideid = speech['speaker_bioguide'] outpath = os.path.join('','json',keybioguideid+'.json') with open(outpath) as json_data: d = json.load(json_data) if d['party']=='D': print("D") with open('democratic/'+speech['speaker']+'-'+str(keybioguideid)+'.txt','a+') as out_json: out_json.write(str(self.remove_stop_words(speech['text']))+'\n') os.chmod('democratic/'+speech['speaker']+'-'+str(keybioguideid)+'.txt', 0o777) with open('democratic_speeches.txt','a+') as out_json: out_json.write(str(self.remove_stop_words(speech['text']))+'\n') elif d['party'] =='R': print("R") with open('republican/'+speech['speaker']+'-'+str(keybioguideid)+'.txt','a+') as out_json: out_json.write(str(self.remove_stop_words(speech['text']))+'\n') os.chmod('republican/'+speech['speaker']+'-'+str(keybioguideid)+'.txt', 0o777) with open('republican_speeches.txt','a+') as out_json: out_json.write(str(self.remove_stop_words(speech['text']))+'\n') '''
def index_file(file_name, documentID):
    stemmer = Porter2Stemmer()
    k = kgram_index()
    # Dealing with punctuation
    p = dict.fromkeys(string.punctuation)
    p.pop('-')  # we need to deal with hyphens separately
    punctuation = str.maketrans(p)
    weight_map = {}
    try:
        with open(file_name) as json_file:
            article_data = json.load(json_file)
            body = unidecode.unidecode(
                article_data['body']).lower().translate(punctuation).split(' ')
            # remove empty strings and lone hyphens
            body = list(filter(lambda t: t != '' and t != '-', body))
            position = 0
            for term in body:
                # build a list of k-gram tokens (k = 1..3) for this term;
                # k-grams don't need to deal with hyphens because the tokens
                # are created anyway
                kgram_list = []
                for i in range(1, 4):
                    if i == 1:
                        kgram_list.extend(k.create_kgram(term, i))
                    else:
                        s = '$' + term + '$'
                        kgram_list.extend(k.create_kgram(s, i))
                # shove each of those tokens into the grand vocab dictionary
                for token in kgram_list:
                    if token in vocab:
                        vocab[token].add(term)
                    else:
                        vocab[token] = set([term])
                # take care of hyphenated words
                if '-' in term:
                    unhyphenated_word = term.replace('-', '')
                    index.add_term(stemmer.stem(unhyphenated_word), documentID, position)
                    hyphened_tokens = term.split('-')
                    for t in hyphened_tokens:
                        index.add_term(stemmer.stem(t), documentID, position)
                else:
                    index.add_term(stemmer.stem(term), documentID, position)
                position += 1
                if term not in weight_map:
                    weight_map[term] = 1
                else:
                    weight_map[term] = weight_map[term] + 1
    except FileNotFoundError as e:
        print(e)

    # compute the document weight Ld from the term frequencies and persist it
    wdt = 0
    i_writer = index_writer()
    for tf in weight_map:
        wdt += pow(1 + log(weight_map[tf]), 2)
    Ld = sqrt(wdt)
    i_writer.write_ld(Ld)
def main():
    # Instances
    # w = wildcard()
    n = near()
    # directory = input('Enter directory for index: ')  # TODO Revert back to original when done
    # Test directories, kept for comparing output:
    # test_dir = '/Users/Cemo/Documents/cecs429/search_engine/corpus/mlb_documents'
    # test_dir = '/Users/Cemo/Documents/cecs429/search_engine/corpus/kumin'
    # test_dir = '/Users/Cemo/Documents/cecs429/search_engine/corpus/disk_test'
    cwd = getcwd()
    start_time = time.time()
    corpus_size = len(listdir(
        '/Users/Cemo/Documents/cecs429/search_engine/corpus/all-nps-sites'))
    # init(test_dir)
    print("--- %s minutes ---" % str((time.time() - start_time) / 60))

    while True:
        # change back to the directory with the DB file in it for sqlite
        chdir(cwd)
        query_or_index = input('[1] - Query\n[2] - Index\n')
        print(query_or_index)
        if query_or_index == '1':
            query_type = input('[1] - Rank\n[2] - Boolean\n')
            if query_type == '1':
                r = rank()
                q = input('Enter query: ')
                r.get_rank(q, corpus_size)
                # print(r.get_rank('wildfire in yosemite', corpus_size))
            else:
                return_docs = []
                user_string = input("Please enter a word search:\n")
                # Special queries
                if ':' in user_string:
                    if ':q' in user_string:
                        exit()
                    if ':stem' in user_string:
                        stemmer = Porter2Stemmer()
                        print("Will be stemming the token")
                        print(user_string.split(" ")[1])
                        print(stemmer.stem(user_string.split(" ")[1]))
                    if ':index' in user_string:
                        print('Will be indexing folder')
                        init(user_string.split(" ")[1].strip())
                    if ':vocab' in user_string:
                        pp = pprint.PrettyPrinter(indent=4)
                        pp.pprint(index.get_dictionary())
                        print('Total number of vocabulary terms: ' + str(index.get_term_count()))
                        print('Will be spitting out words')
                elif '*' in user_string:
                    print("This will get sent off to the wildcard class")
                    return_docs.extend(wild(user_string))
                elif 'near' in user_string:
                    # parse NEAR queries of the form "term1 near/k term2"
                    near_parts = user_string.split(' ')
                    k = near_parts[1].split('/')
                    return_docs.extend(n.near(index, near_parts[0], near_parts[2], int(k[1])))
                else:
                    if user_string:
                        q = Query()
                        return_docs = q.query_parser(user_string)
                    else:
                        print('No query entered')
                print('DOC_LIST: ' + str(return_docs))

                # Allow the user to select a document to view
                doc_list = list(map(document_parser, return_docs))
                if len(doc_list) != 0:
                    for document in doc_list:
                        print('Document ' + document)
                    print('Documents found: ' + str(len(doc_list)))
                    document_selection = input('Please select a document you would like to view: ')
                    while document_selection != 'no':
                        if document_selection in doc_list:
                            open_file_content(document_selection)
                        document_selection = input('Please select a document you would like to view: ')
                else:
                    print('No documents were found')
        else:
            print('Please dont')
            directory = input('Enter directory for index: ')  # TODO Revert back to original when done
            init(directory)
            i_writer = index_writer()
            i_writer.write_index_to_disk(index.get_index())
def stemming():
    stemmer = Porter2Stemmer()
    for w in words():
        print(w, stemmer.stem(w), sep='\t')
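# stemming() iterates over a words() generator that is not shown in this
# snippet; a minimal stand-in for experimenting with it (the file name is
# hypothetical):
def words():
    with open('sample.txt') as f:
        for line in f:
            yield from line.split()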
import re
import math
import numpy as np
import matplotlib.pyplot as plt
from random import shuffle
from porter2stemmer import Porter2Stemmer

##### Stop words
stoplist = []
f = open("/home/sree/Machine-Learning-course/Assignment2/stopwords.txt", 'r')
for line in f:
    stoplist.append(line.replace("\n", ""))
# print(stoplist)

stemmer = Porter2Stemmer()

##### Remove stop words from messages and form a 2-d list
wosw = []
f = open("/home/sree/Machine-Learning-course/Assignment2/Assignment_2_data.txt", 'r')
for line in f:
    z = re.split('\t| |;|,|\*|\n|\.', line)
    b = filter(lambda a: a != '', z)
    l3 = [stemmer.stem(x) for x in b if x.lower() not in stoplist]
    wosw.append(l3)
shuffle(wosw)
# print(wosw)

##### Make a list of tokens
tokens = []
for row in wosw:
def test_index_txt_file():
    txt_index = positional_inverted_index()
    stemmer = Porter2Stemmer()
    k = kgram_index()
    file_names = []
    documentID = 1

    # Dealing with punctuation
    p = dict.fromkeys(string.punctuation)
    p.pop('-')  # we need to deal with hyphens separately
    punctuation = str.maketrans(p)

    directory = path.dirname(path.realpath(__file__)) + '/unit_test_docs/'
    chdir(directory)
    for file in listdir(directory):
        if file.endswith('.txt'):
            file_names.append(str(file))

    for file in file_names:
        try:
            with open(file) as txt_file:
                content = txt_file.readlines()
                content = content[0].lower().translate(punctuation).split(' ')
                content = list(filter(lambda w: w != '', map(lambda s: s.strip(), content)))
                positions_dict = {}
                for i in range(0, len(content)):
                    if '-' in content[i]:
                        # index each hyphen part plus the hyphen-stripped word
                        hyphened_word_parts = content[i].split('-')
                        hyphened_word = content[i].replace('-', '')
                        hyphened_word_parts.append(hyphened_word)
                        for word in hyphened_word_parts:
                            if word in positions_dict:
                                positions_dict[word].append(i)
                            else:
                                positions_dict[word] = [i]
                    else:
                        if content[i] in positions_dict:
                            positions_dict[content[i]].append(i)
                        else:
                            positions_dict[content[i]] = [i]
                for key in positions_dict:
                    txt_index.add_term(stemmer.stem(key), documentID, positions_dict[key])
        except FileNotFoundError as e:
            print(e)
        documentID = documentID + 1

    for key in txt_index.get_index():
        txt_index.print_term_info(key)

    correct_map = {}
    correct_map['today'] = [posting(1, [0]), posting(2, [0]), posting(3, [0])]
    correct_map['i'] = [posting(1, [1, 6, 11]), posting(2, [1]), posting(3, [1]), posting(4, [0])]
    correct_map['fell'] = [posting(1, [2])]
    correct_map['in'] = [posting(1, [3])]
    correct_map['a'] = [posting(1, [4])]
    correct_map['well'] = [posting(1, [5])]
    correct_map['have'] = [posting(1, [7]), posting(4, [1])]
    correct_map['no'] = [posting(1, [8]), posting(5, [8])]
    correct_map['mouth'] = [posting(1, [9])]
    correct_map['but'] = [posting(1, [10])]
    correct_map['want'] = [posting(1, [12])]
    correct_map['to'] = [posting(1, [13])]
    correct_map['scream'] = [posting(1, [14])]
    correct_map['top'] = [posting(2, [2])]
    correct_map['deck'] = [posting(2, [3])]
    correct_map['lethal'] = [posting(2, [4])]
    correct_map['yogg'] = [posting(2, [5])]
    correct_map['saron'] = [posting(2, [5])]
    correct_map['yoggsaron'] = [posting(2, [5])]
    correct_map['f**k'] = [posting(2, [6])]
    correct_map['me'] = [posting(2, [7]), posting(4, [8])]
    correct_map['over'] = [posting(2, [8])]
    correct_map['super'] = [posting(2, [9])]
    correct_map['hard'] = [posting(2, [10])]
    correct_map['learn'] = [posting(3, [2])]
    correct_map['the'] = [posting(3, [3]), posting(5, [2])]
    correct_map['mean'] = [posting(3, [4])]
    correct_map['of'] = [posting(3, [5])]
    correct_map['pain'] = [posting(3, [6])]
    correct_map['it'] = [posting(3, [7]), posting(4, [9]), posting(5, [12])]
    correct_map['was'] = [posting(3, [8]), posting(4, [10])]
    correct_map['all'] = [posting(3, [9])]
    correct_map['caus'] = [posting(3, [10])]
    correct_map['by'] = [posting(3, [11])]
    correct_map['nealdt'] = [posting(3, [12])]
    correct_map['ascend'] = [posting(4, [2])]
    correct_map['into'] = [posting(4, [3])]
    correct_map['enlighten'] = [posting(4, [4])]
    correct_map['my'] = [posting(4, [5])]
    correct_map['waifu'] = [posting(4, [6])]
    correct_map['told'] = [posting(4, [7])]
    correct_map['actual'] = [posting(4, [11])]
    correct_map['okay'] = [posting(4, [12])]
    correct_map['jesus'] = [posting(5, [0])]
    correct_map['take'] = [posting(5, [1])]
    correct_map['wheel'] = [posting(5, [3])]
    correct_map['or'] = [posting(5, [4])]
    correct_map['els'] = [posting(5, [5])]
    correct_map['asian'] = [posting(5, [6])]
    correct_map['driver'] = [posting(5, [7])]
    correct_map['survivor'] = [posting(5, [9])]
    correct_map['dont'] = [posting(5, [10])]
    correct_map['let'] = [posting(5, [11])]
    correct_map['happen'] = [posting(5, [13])]

    for keys in txt_index.get_index():
        assert keys in correct_map
class RecommendationModule:
    data_list = None
    data_n = None
    cleaned_data_list = []
    vocab = set()
    stop_words = set()  # set of stop words
    stemmer = Porter2Stemmer()  # init porter stemmer
    config_path = None
    config = None
    conn = None

    def __init__(self):
        """
        init config, stop words, data list, database
        """
        self.config_path = 'config.ini'
        self.config = configparser.ConfigParser()
        self.config.read(self.config_path, 'utf-8')
        with open(self.config['DEFAULT']['STOPWORDS_PATH'], encoding='utf-8') as f:
            self.stop_words = set(f.read().split())
        self.conn = sqlite3.connect(self.config['DEFAULT']['SE_DB_PATH'])
        self.conn.row_factory = sqlite3.Row
        c = self.conn.cursor()
        self.data_list = c.execute('select * from recipes').fetchall()
        self.data_n = len(self.data_list)
        c.execute('drop table if exists k_nearest')
        c.execute(
            'create table k_nearest (id integer primary key, '
            'nn1 integer, nn2 integer, nn3 integer, nn4 integer, nn5 integer)')
        self.conn.commit()

    def __del__(self):
        """
        close the database
        :return:
        """
        self.conn.close()

    def get_list_maxnum_index(self, num_list, top):
        """
        get the indices of the `top` largest numbers in the list
        :param num_list:
        :param top:
        :return:
        """
        num_dict = {}
        for i in range(len(num_list)):
            num_dict[i] = num_list[i]
        res_list = sorted(num_dict.items(), key=lambda e: e[1])
        max_num_index = [one[0] for one in res_list[::-1][:top]]
        return list(max_num_index)

    def data_cleanup_tf(self, data):
        """
        clean data and construct tf dictionary
        :param data:
        :return: length of data and tf dictionary
        """
        tf_dict = {}  # {term: tf, ...}
        n = 0  # length of data
        terms = data.lower().split()  # lower the data and split
        for term in terms:
            # filter stop words having quotation marks
            # filter sites in a simple way
            if (term not in self.stop_words) and ('http' not in term) and (
                    'www' not in term):
                term = re.sub(r'[^a-z]', '', term)  # remove non-alphabetic letters
                # filter stop words again and blank term
                if (term not in self.stop_words) and (len(term) != 0):
                    term = self.stemmer.stem(term)  # stemming
                    n += 1
                    if term in tf_dict:
                        tf_dict[term] += 1
                    else:
                        tf_dict[term] = 1
        return n, tf_dict

    def construct_data_vocab(self):
        """
        construct vocabulary with only the title
        :return:
        """
        for recipe in self.data_list:
            name = recipe['name']
            # ingredients = recipe['ingredients']
            term_tf = self.data_cleanup_tf(name)[1]
            for term in term_tf.keys():
                self.vocab.add(term)
            self.cleaned_data_list.append(list(term_tf.keys()))

    def write_row_to_db(self, rid_self, rid_list):
        """
        write each row into the database
        :param rid_self:
        :param rid_list:
        :return:
        """
        c = self.conn.cursor()
        values = (rid_self, rid_list[0], rid_list[1], rid_list[2], rid_list[3], rid_list[4])
        c.execute('insert into k_nearest values (?, ?, ?, ?, ?, ?)', values)
        self.conn.commit()

    def construct_k_nearest(self):
        """
        construct the k nearest rid
        :return:
        """
        word2id = {}
        for word_id, word in enumerate(self.vocab):
            word2id[word] = word_id
        row2rid = {}  # convert the row id to the recipe id
        matrix_size = (self.data_n, len(word2id))
        X = dok_matrix(matrix_size)
        for i, recipe in enumerate(self.data_list):
            rid = recipe['id']
            name = recipe['name']
            row2rid[i] = rid
            term_tf = self.data_cleanup_tf(name)[1]
            for term, tf in term_tf.items():
                X[i, word2id[term]] = tf
        knn = NearestNeighbors(n_neighbors=6).fit(X)
        for row, x in enumerate(self.cleaned_data_list):
            x_in = dok_matrix((1, len(word2id)))
            for term in x:
                x_in[0, word2id[term]] += 1
            neighbours = knn.kneighbors(x_in, 6, return_distance=False)[0]
            rid_self = row2rid[row]
            rid_list = list(set([row2rid[row] for row in neighbours]) - set([rid_self]))
            self.write_row_to_db(rid_self, rid_list)
        # dictionary = corpora.Dictionary(cleaned_data_list)  # generate the dictionary
        # corpus = [dictionary.doc2bow(item) for item in cleaned_data_list]
        # tfidf = models.TfidfModel(corpus)
        # num_features = len(dictionary.token2id.keys())  # number of terms in the dictionary
        # index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=num_features)
        # for i, data in enumerate(cleaned_data_list):
        #     vector = dictionary.doc2bow(data)
        #     sims = index[tfidf[vector]]
        #     row_list = self.get_list_maxnum_index(list(sims), 6)
        #     rid_list = [row2rid[row] for row in row_list]
        #     self.write_row_to_db(row2rid[i], rid_list.remove(row2rid[i])[:5])

    def find_k_nearest(self):
        """
        find the k nearest rid and write to the database
        :return:
        """
        self.construct_data_vocab()
        self.construct_k_nearest()
def doStem(word):
    stemmer = Porter2Stemmer()
    return stemmer.stem(word)
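# Quick sanity check for doStem(); Porter2 reduces all of these to 'connect'.
if __name__ == '__main__':
    for w in ('connection', 'connected', 'connecting'):
        print(w, '->', doStem(w))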
truncate_similarities()

# In case the data is read from the database, the following is required instead:
# article_master = import_content()

# LOCAL IMPORT
article_master = pd.read_csv(os.path.abspath("./data/content_metadata.csv"))

## PREPROCESS CONTENT
print("Previous Model Truncated.")
print("Pre-processing....")

# REDUCE CONTENT: strip HTML, lower-case, keep letters and whitespace only
article_master['reduced_content'] = article_master.apply(
    lambda row: re.sub(r'[^a-z\s]', '', filter_html(row.bodytext).lower()), axis=1)

snowball = Porter2Stemmer()
article_master['stemmed_content'] = article_master.apply(
    lambda row: text_stemmer(row.reduced_content, snowball), axis=1)
article_master['stemmed_content'] = article_master['stemmed_content'].fillna('')

# REDUCE TITLE:
# Note that numbers are removed from the content but kept in the title
article_master['reduced_title'] = article_master.apply(
    lambda row: re.sub(r'[^a-z0-9\s]', '', row.title.lower()), axis=1)
article_master['stemmed_title'] = article_master.apply(
    lambda row: text_stemmer(row.reduced_title, snowball), axis=1)
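# text_stemmer() is referenced above but not defined in this snippet; a
# plausible minimal version consistent with how it is called (an assumption,
# not the project's actual helper):
def text_stemmer(text, stemmer):
    # stem each whitespace-separated token and re-join into one string
    return ' '.join(stemmer.stem(token) for token in text.split())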
class Index_name(dbbase):
    __table__ = Table('index_name', md, autoload=True)
    stop_words = get_stopwords()
    stemmer = Porter2Stemmer()
    DATA_N = 163249
    AVG_LEN = 33.782259003117936

    def fetch_from_db(self, term):
        """
        fetch the corresponding index row from the database
        :param term:
        :return:
        """
        row = dbsession.query(Index_name).filter_by(term=term).first()
        return row

    def data_cleanup_tf(self, data):
        """
        clean data and construct tf dictionary
        :param data:
        :return: length of data and tf dictionary
        """
        tf_dict = {}  # {term: tf, ...}
        n = 0  # length of data
        terms = data.lower().split()  # lower the data and split
        for term in terms:
            # filter stop words having quotation marks
            # filter sites in a simple way
            if (term not in self.stop_words) and ('http' not in term) and ('www' not in term):
                term = re.sub(r'[^a-z]', '', term)  # remove non-alphabetic letters
                # filter stop words again and blank term
                if (term not in self.stop_words) and (len(term) != 0):
                    term = self.stemmer.stem(term)  # stemming
                    n += 1
                    if term in tf_dict:
                        tf_dict[term] += 1
                    else:
                        tf_dict[term] = 1
        return n, tf_dict

    def result_by_tfidf(self, query):
        """
        rank recipe ids by tf-idf score, using the title only
        :param query:
        :return:
        """
        n, tf_dict = self.data_cleanup_tf(query)
        tfidf_scores = {}
        for term in tf_dict.keys():
            r = self.fetch_from_db(term)
            if r is None:
                continue
            df = r.df
            idf = math.log(self.DATA_N / df)
            posting_list = r.postings.split('\n')
            for posting in posting_list:
                rid, tf, length = posting.split('\t')
                rid = int(rid)
                tf = int(tf)
                # accumulate (1 + log tf) * idf, weighted by the query term frequency
                s = (1 + math.log(tf)) * idf * tf_dict[term]
                if rid in tfidf_scores:
                    tfidf_scores[rid] = tfidf_scores[rid] + s
                else:
                    tfidf_scores[rid] = s
        tfidf_scores = sorted(tfidf_scores.items(), key=operator.itemgetter(1))
        tfidf_scores.reverse()
        result = [x[0] for x in tfidf_scores]
        if len(result) == 0:
            return 0, []
        else:
            return 1, result
def stemming_tokenizer(self, text):
    stemmer = Porter2Stemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]
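# Hedged sketch of how a stemming tokenizer like the one above is commonly
# plugged into scikit-learn; everything below is illustrative and assumes
# nltk's 'punkt' data, porter2stemmer and scikit-learn are installed.
from nltk.tokenize import word_tokenize
from porter2stemmer import Porter2Stemmer
from sklearn.feature_extraction.text import TfidfVectorizer

_stemmer = Porter2Stemmer()

def stem_tokenize(text):
    return [_stemmer.stem(w) for w in word_tokenize(text)]

vectorizer = TfidfVectorizer(tokenizer=stem_tokenize)
X = vectorizer.fit_transform(["Cats are running.", "A cat ran quickly."])
print(sorted(vectorizer.vocabulary_))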