Example #1
    def remove_stop_words(self, frequency_list):
        # Load the comma-separated stop-word list.
        with open("stop_words_list.txt") as stop_words_list:
            stop_words = stop_words_list.read().split(',')

        stemmer = Porter2Stemmer()
        result = ''
        try:
            # Strip punctuation before tokenizing.
            for c in string.punctuation:
                frequency_list = frequency_list.replace(c, "")

            querywords = frequency_list.split()

            # Stem every token that is not a stop word.
            resultwords = [stemmer.stem(word) for word in querywords
                           if word.lower() not in stop_words]

            result = ' '.join(resultwords)
        except ValueError:
            print("no such value")

        return result
Example #2
 def __init__(self):
     self.stopwords = []
     self.stemmer = Porter2Stemmer()
     with open('stopwords.txt') as my_file:
         for line in my_file:
             self.stopwords.append(self.stemmer.stem(line[:-1]))
     self.ignorechars = ''',:'!()'''
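
The constructor above stems each stop word as it reads the file, which only pays off if later membership tests also stem the token being checked. A minimal sketch of that matching step, assuming the same one-word-per-line stopwords.txt:

from porter2stemmer import Porter2Stemmer

stemmer = Porter2Stemmer()
with open('stopwords.txt') as f:
    stopwords = {stemmer.stem(line.strip()) for line in f}

def keep_token(token):
    # A token is kept only if its stem is absent from the stemmed stop list.
    return stemmer.stem(token.lower()) not in stopwords

tokens = [t for t in "The runners were running quickly".split() if keep_token(t)]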
Example #3
    def near(self, index, first_term, second_term, k):

        doc_list = set()

        # stemming words first, can remove this later
        stemmer = Porter2Stemmer()
        first_term = stemmer.stem(first_term)
        second_term = stemmer.stem(second_term)

        # Max number of iterations is the max size of the bigger list
        max_length = max(len(index.get_postings(first_term)),
                         len(index.get_postings(second_term)))

        f_postings_list = index.get_postings(first_term)
        s_postings_list = index.get_postings(second_term)
        i = 0
        j = 0

        # both_set = index.get_all_doc_ids(first_term).intersection(index.get_all_doc_ids(second_term))
        # the maximum number of times to iterate is the max length of the list
        while True:
            if i >= len(f_postings_list) or j >= len(s_postings_list):
                return doc_list

            if f_postings_list[i].get_document_id(
            ) == s_postings_list[j].get_document_id():
                f_pos_list = f_postings_list[i].get_positions()
                s_pos_list = s_postings_list[j].get_positions()

                # Discard any second-term position that comes before the
                # first occurrence of the first term; only positions after
                # that first position can satisfy NEAR/k.
                s_pos_list = [p for p in s_pos_list if p > f_pos_list[0]]

                # We can stop at the first pair that satisfies NEAR/k.
                for second_pos in s_pos_list:
                    # True if the second term appears after the first term
                    # and within k positions of it.
                    within_k = any(
                        0 < second_pos - first_pos <= k
                        for first_pos in f_pos_list)
                    if within_k:
                        doc_list.add(f_postings_list[i].get_document_id())
                        break

                i += 1
                j += 1

            else:
                # advance whichever pointer currently has the smaller document id
                if (f_postings_list[i].get_document_id() <
                        s_postings_list[j].get_document_id()):
                    i += 1
                else:
                    j += 1
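
The heart of the NEAR/k query above is the positional comparison between two sorted position lists. A standalone sketch of just that check, independent of the index and posting classes the example assumes:

def near_within_k(first_positions, second_positions, k):
    """Return True if some second-term position falls after a first-term
    position and at most k slots away from it."""
    return any(
        0 < second - first <= k
        for second in second_positions
        for first in first_positions)

# Positions two apart satisfy NEAR/3 but not NEAR/1.
assert near_within_k([4, 17], [6, 40], 3) is True
assert near_within_k([4, 17], [6, 40], 1) is False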
Example #4
    def write_index_to_disk(self, index):
        """
        Writes to disk whatever index dictionary that is given, slow as hell though
        :param index: Index gets passed a dictionary
        :return:
        """
        stem = Porter2Stemmer()
        vb = vbe()
        position_term_db = position_db('/Users/Cemo/Documents/cecs429/search_engine/DB/term_positions_federalists.db')
        position_term_db.create_table()
        current_index = index
        sorted_key_list = sorted(index)
        index_binary_file = open('index_federalists.bin', 'wb')

        for key in sorted_key_list:
            if not key:
                continue
            position_term_db.add_term(stem.stem(key.lower()), index_binary_file.tell())

            disk_write_list = []
            df = len(current_index[key])
            for number in vb.encode_number(df):
                index_binary_file.write(pack(">B", number))

            postings = current_index[key]
            for i in range(len(current_index[key])):
                # TODO gaps seems to be working
                if i == 0:
                    doc_id = postings[i].get_document_id()
                else:
                    doc_id = postings[i].get_document_id() - postings[i-1].get_document_id()

                disk_write_list.append(doc_id)

                for number in vb.encode_number(doc_id):
                    index_binary_file.write(pack(">B", number))

                tf = postings[i].positions_list
                disk_write_list.append(len(tf))

                for number in vb.encode_number(len(tf)):
                    index_binary_file.write(pack(">B", number))
                for j in range(len(tf)):
                    # TODO gaps seems to be working
                    if j == 0:
                        disk_write_list.append(tf[j])
                        for number in vb.encode_number(tf[j]):
                            index_binary_file.write(pack(">B", number))

                    else:
                        disk_write_list.append(tf[j] - tf[j - 1])
                        for number in vb.encode_number(tf[j] - tf[j - 1]):
                            index_binary_file.write(pack(">B", number))
        position_term_db.close_connection_commit()
        index_binary_file.close()
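
The writer above leans on a vbe helper for variable-byte compression of document-id and position gaps. A minimal sketch of the classic variable-byte scheme (7 payload bits per byte, high bit marking the last byte of each number), shown only to illustrate the idea; the project's actual vbe class may differ:

def vb_encode_number(n):
    # Encode one non-negative integer as a list of byte values.
    chunks = []
    while True:
        chunks.insert(0, n % 128)   # low-order 7 bits end up last in the output
        if n < 128:
            break
        n //= 128
    chunks[-1] += 128               # set the high bit on the terminating byte
    return chunks

def vb_decode(byte_stream):
    # Decode a stream of variable-byte encoded integers.
    numbers, n = [], 0
    for byte in byte_stream:
        if byte < 128:
            n = n * 128 + byte
        else:
            numbers.append(n * 128 + (byte - 128))
            n = 0
    return numbers

assert vb_decode(vb_encode_number(5) + vb_encode_number(300)) == [5, 300]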
Example #5
 def stop_stem(self, data):
     # Read the stop-word list once, then stem every word that is not a stop word.
     with open(
         "/home/lalit/Desktop/IR_LAB/Final_Index/Eng_code/stop_eng.txt"
     ) as stop_file:
         stop = stop_file.read().split("\n")
     stemmer = Porter2Stemmer()
     return [stemmer.stem(word) for word in data if word not in stop]
Example #6
    def stem(self, list_of_words):
    # returns a list of stemmed words

        stemmed_list = []
        stemmer = Porter2Stemmer()

        for word in list_of_words:
            stemmed_word = stemmer.stem(word.lower())
            stemmed_list.append(stemmed_word)

        return stemmed_list
Example #7
    def remove_stop_words(self, frequency_list):

        # Load the comma-separated stop-word list.
        with open("stop_words_list.txt") as stop_words_list:
            stop_words = stop_words_list.read().split(',')

        stemmer = Porter2Stemmer()
        result = ''
        try:
            # Strip punctuation before tokenizing.
            for c in string.punctuation:
                frequency_list = frequency_list.replace(c, "")

            querywords = frequency_list.split()

            # Stem every token that is not a stop word.
            resultwords = [
                stemmer.stem(word) for word in querywords
                if word.lower() not in stop_words
            ]

            result = ' '.join(resultwords)

        except ValueError:
            print("no such value")

        return result
Example #8
def index_file(file_name, documentID):
    stemmer = Porter2Stemmer()
    punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    with open(file_name) as json_file:
        article_data = json.load(json_file)

        body = (article_data['body']).lower().translate(punctuation).split(' ')
        body = list(filter(lambda w: w != '', map(lambda s: s.strip(), body)))

        term_positions = find_positions(body)

        for key in term_positions:
            index.add_term(key, documentID, term_positions[key])
            stemmed_term = stemmer.stem(key)
            # Also index under the stemmed form when it differs and is not present yet.
            if stemmed_term != key and stemmed_term not in index.m_index:
                index.add_term(stemmed_term, documentID, term_positions[key])
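
index_file above assumes a find_positions helper that maps each term in the token list to the positions where it occurs; that helper is not shown, so the following is only a plausible sketch of it:

from collections import defaultdict

def find_positions(tokens):
    # Hypothetical helper: map each term to its 0-based positions in tokens.
    positions = defaultdict(list)
    for i, token in enumerate(tokens):
        positions[token].append(i)
    return dict(positions)

# find_positions(['to', 'be', 'or', 'not', 'to', 'be'])
# -> {'to': [0, 4], 'be': [1, 5], 'or': [2], 'not': [3]}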
Example #9
    def link_promise(self, link):
        promise = 0.0
        stemmer = Porter2Stemmer()
        if link is None:
            return promise
        try:
            #get terms in the link
            link_terms = re.findall(r"\w+", link.lower())
            link_terms = [stemmer.stem(term) for term in link_terms]

        except:
            print("link error")
            return promise
        #get terms in the query
        link_terms_count = collections.Counter(link_terms)
        self.query_terms = [stemmer.stem(term) for term in self.query_terms]
        #calculate promise
        for term in self.query_terms:
            if term in link_terms_count:
                promise = promise + 0.1 * link_terms_count[term]
        return promise
Example #10
def wild(word_input):
    kg = kgram_index()
    w = wildcard()
    stemmer = Porter2Stemmer()

    ktokens = []
    wildcard_tokens = w.wildcard_parser(word_input)

    for token in wildcard_tokens:
        k = 0
        if len(token) > 3:
            k = 3
        else:
            k = len(token)
        ktokens.extend(kg.create_kgram(token, k))

    # remove '$' from tokens
    ktokens[:] = [x for x in ktokens if x != '$']

    candidate_lists = []

    for token in ktokens:
        if token in vocab:
            candidate_lists.append(vocab[token])
            print(token, list(vocab[token]), len(vocab))

    intersected_list = list(
        set(candidate_lists[0].intersection(*candidate_lists[1:])))

    n = list(map(lambda t: stemmer.stem(t), intersected_list))
    n = list(map(lambda t: index.get_index()[t], n))

    doc_list = []
    for p_list in n:
        for post in p_list:
            doc_list.append(post.get_document_id())
    # return list of docs for the word found
    return doc_list
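
The wildcard lookup depends on kgram_index.create_kgram(token, k) to break a $-bounded token into k-grams. That class is not shown here, so the following is an assumed sketch of what such a method typically does:

def create_kgram(token, k):
    # Hypothetical k-gram generator: all length-k substrings of the token.
    if len(token) <= k:
        return [token]
    return [token[i:i + k] for i in range(len(token) - k + 1)]

# create_kgram('$red', 3) -> ['$re', 'red']
# create_kgram('re', 3)   -> ['re']   (tokens shorter than k are returned whole)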
Example #11
def preprocess(raw_text):
    lang = detect(raw_text)
    # 1. convert to lower case and split into tokens
    words = raw_text.lower().split()
    # 2. lemmatize Russian text with Mystem; stem everything else with Porter2
    lemmatized_words = []
    if lang == 'ru':
        m = Mystem()
        for word in words:
            lemmatized_words.append(m.lemmatize(word)[0])
    else:
        stemmer = Porter2Stemmer()
        lemmatized_words = [stemmer.stem(word) for word in words]
    return lemmatized_words
Example #12
def Cleaner(text):

    # import/download relevant packages
    import string
    import nltk
    nltk.download('punkt')
    nltk.download('stopwords')

    # split into words
    from nltk.tokenize import word_tokenize
    tokens = word_tokenize(text)

    # convert to lower case
    tokens = [w.lower() for w in tokens]

    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]

    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]

    # filter out stop words
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]

    # stemming of words
    from porter2stemmer import Porter2Stemmer
    stemmer = Porter2Stemmer()
    stemmed = [stemmer.stem(word) for word in words]

    # final output
    global clean_text
    clean_text = stemmed
    return clean_text
Example #13
class IndexModule:
    data_list = None
    data_n = None

    stop_words = set()  # set of stop words
    stemmer = Porter2Stemmer()  # init porter stemmer

    config_path = None
    config = None

    conn = None

    def __init__(self):
        """
        init config, stop words, data list
        """
        self.config_path = 'config.ini'
        self.config = configparser.ConfigParser()
        self.config.read(self.config_path, 'utf-8')

        with open(self.config['DEFAULT']['STOPWORDS_PATH'],
                  encoding='utf-8') as f:
            self.stop_words = set(f.read().split())

        self.conn = sqlite3.connect(self.config['DEFAULT']['SE_DB_PATH'])
        self.conn.row_factory = sqlite3.Row
        c = self.conn.cursor()
        self.data_list = c.execute('select * from recipes').fetchall()
        self.data_n = len(self.data_list)

        # write DATA_N to config
        self.config.set('DEFAULT', 'DATA_N', str(self.data_n))
        with open(self.config_path, 'w', encoding='utf-8') as f:
            self.config.write(f)

    def __del__(self):
        """
        close the database
        :return:
        """
        self.conn.close()

    def write_index_to_db(self, index, table_name):
        """
        write inverted index to db
        index form: {term: [df, [posting, ...]], ...}
        :param table_name:
        :param index:
        :return:
        """
        conn = sqlite3.connect(self.config['DEFAULT']['SE_DB_PATH'])
        c = conn.cursor()

        c.execute('drop table if exists %s' % table_name)
        c.execute(
            'create table %s (term text primary key, df integer, postings text)'
            % table_name)

        for key, value in index.items():
            posting_list = '\n'.join(map(str, value[1]))
            values = (key, value[0], posting_list)
            c.execute('insert into %s values (?, ?, ?)' % table_name, values)

        conn.commit()
        conn.close()

    def data_cleanup_tf(self, data):
        """
        clean data and construct tf dictionary
        :param data:
        :return: length of data and tf dictionary
        """
        tf_dict = {}  # {term: tf, ...}
        n = 0  # length of data

        terms = data.lower().split()  # lower the data and split
        for term in terms:
            # filter stop words having quotation marks
            # filter sites in a simple way
            if (term not in self.stop_words) and ('http' not in term) and (
                    'www' not in term):
                term = re.sub(r'[^a-z]', '',
                              term)  # remove non-alphabetic letters
                # filter stop words again and blank term
                if (term not in self.stop_words) and (len(term) != 0):
                    term = self.stemmer.stem(term)  # stemming
                    n += 1
                    if term in tf_dict:
                        tf_dict[term] += 1
                    else:
                        tf_dict[term] = 1
        return n, tf_dict

    def construct_index_name_desc_ing(self):
        """
        construct inverted index with name and description
        :return:
        """
        inverted_index = {}  # form: {term: [df, [posting, ...]], ...}
        AVG_LEN = 0  # average length for name and description

        for recipe in self.data_list:
            rid = recipe['id']
            name = recipe['name']
            description = recipe['description']
            ingredients = recipe['ingredients']

            length, term_tf = self.data_cleanup_tf(name + ' ' + description +
                                                   ' ' + ingredients)
            AVG_LEN += length
            for term, tf in term_tf.items():
                posting = Posting(rid, tf, length)
                if term in inverted_index:
                    inverted_index[term][0] += 1  # df++
                    inverted_index[term][1].append(
                        posting)  # add posting to list
                else:
                    inverted_index[term] = [1, [posting]]  # [df, [posting]]

        AVG_LEN /= self.data_n
        # print(len(inverted_index), inverted_index)

        # write AVG_LEN to config
        self.config.set('DEFAULT', 'AVG_LEN', str(AVG_LEN))
        with open(self.config_path, 'w', encoding='utf-8') as f:
            self.config.write(f)

        self.write_index_to_db(inverted_index, 'index_name_desc_ing')

    def construct_index_name(self):
        """
        construct inverted index with name
        :return:
        """
        inverted_index = {}  # form: {term: [df, [posting, ...]], ...}

        for recipe in self.data_list:
            rid = recipe['id']
            name = recipe['name']

            length, term_tf = self.data_cleanup_tf(name)
            for term, tf in term_tf.items():
                posting = Posting(rid, tf, length)
                if term in inverted_index:
                    inverted_index[term][0] += 1  # df++
                    inverted_index[term][1].append(
                        posting)  # add posting to list
                else:
                    inverted_index[term] = [1, [posting]]  # [df, [posting]]
        # print(len(inverted_index), inverted_index)

        self.write_index_to_db(inverted_index, 'index_name')

    def construct_index_ingredient(self):
        """
        construct inverted index with ingredient
        :return:
        """
        inverted_index = {}  # form: {term: [df, [posting, ...]], ...}

        for recipe in self.data_list:
            rid = recipe['id']
            ing = recipe['ingredients']

            length, term_tf = self.data_cleanup_tf(ing)
            for term, tf in term_tf.items():
                posting = Posting(rid, tf, length)
                if term in inverted_index:
                    inverted_index[term][0] += 1  # df++
                    inverted_index[term][1].append(
                        posting)  # add posting to list
                else:
                    inverted_index[term] = [1, [posting]]  # [df, [posting]]

        self.write_index_to_db(inverted_index, 'index_ingredient')
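
IndexModule builds its postings lists out of Posting(rid, tf, length) objects and serializes them with str() in write_index_to_db, but the Posting class itself is not included here. A minimal sketch consistent with that usage (an assumption, not the project's actual definition):

class Posting:
    # Hypothetical posting: document id, term frequency, and field length.
    def __init__(self, rid, tf, length):
        self.rid = rid        # recipe/document id
        self.tf = tf          # term frequency in this document
        self.length = length  # token count of the indexed field(s)

    def __str__(self):
        # write_index_to_db joins postings with str(), so a compact
        # "rid tf length" form keeps the stored text easy to parse back.
        return '{} {} {}'.format(self.rid, self.tf, self.length)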
Example #14
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from textblob import TextBlob as tb

import os
import collections
import time
from os import walk

import networkx
import numpy as np
import scipy.stats
import statistics
from statistics import mean, stdev

from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.tokenize import RegexpTokenizer
from porter2stemmer import Porter2Stemmer

ps = Porter2Stemmer()
stop_words = set(stopwords.words('english'))


class Node:
    def __init__(self, author_name, count):  # create constructor
        self.AuthorName = author_name
        self.Count = count
        self.Impact = 0
Example #15
def index_file(directory, file_name, documentID, index):
    stemmer = Porter2Stemmer()
    # Dealing with punctuation
    p = dict.fromkeys(string.punctuation)
    p.pop('-')  # we need to deal with hyphens
    weight_map = {}

    try:
        with open(directory + file_name) as txt_file:
            # Trying to normalize the vocab, getting rid of non alphanumeric,
            body = txt_file.read().replace('\n', '').lower()
            body = re.sub(r'[^A-Za-z0-9#]+', ' ', body)
            body = body.split(
                ' ')  # Gets rid of any \n that appear in the text

            body = list(
                filter(lambda t: t != '' and t != '-',
                       body))  # remove single spaces and single hyphens

            position = 0
            for term in body:
                # take care of hyphenated words
                if '-' in term:
                    unhyphenated_word = term.replace('-', '')
                    index.add_term(stemmer.stem(unhyphenated_word), documentID,
                                   position)
                    hyphened_tokens = term.split('-')
                    for t in hyphened_tokens:
                        all_docs_index.add_term(stemmer.stem(t), documentID,
                                                position)
                else:
                    index.add_term(stemmer.stem(term), documentID, position)
                position += 1
                if term not in weight_map:
                    weight_map[term] = 1
                else:
                    weight_map[term] = weight_map[term] + 1

    except FileNotFoundError as e:
        print(e)

    doc_total_tf[
        file_name] = weight_map  # given a document, it will return a map of that docs tf

    score_map = {}
    wdt = 0
    # Gets the Wdt's of the terms in the file
    for tf in weight_map:
        score = pow(1 + log(weight_map[tf]), 2)
        score_map[tf] = score
        wdt += score**2
    Ld = sqrt(wdt)
    length = 0
    for tf in score_map:
        score_map[tf] = score_map[tf] / Ld
        length += score_map[tf]**2

    doc_wdt[file_name] = score_map
    # Things to turn in for Neal, it's just the easiest place to put this
    if file_name == 'paper_52.txt':
        print('First 30 components of document 52')
        get_first_thirty(score_map, True)
Example #16
0
class crToPG(object):

    stemmer = Porter2Stemmer()

    #remove stop words
    def remove_stop_words(self, frequency_list):

        # Load the comma-separated stop-word list.
        with open("stop_words_list.txt") as stop_words_list:
            stop_words = stop_words_list.read().split(',')

        stemmer = Porter2Stemmer()
        result = ''
        try:
            querywords = frequency_list.split()

            # Show each token next to its stem (diagnostic output).
            for stemwords in querywords:
                print(stemwords, stemmer.stem(stemwords))

            # Keep the original (unstemmed) tokens that are not stop words.
            resultwords = [
                word for word in querywords if word.lower() not in stop_words
            ]
            result = ' '.join(resultwords)

        except ValueError:
            print("no such value")

        return result

    def remove_custom_stop_words(self, word_list):

        # Build the custom stop-word list, one entry per non-blank line.
        with open("stop_words_list.txt") as stop_words_file:
            stop_words_list = [
                line.strip() for line in stop_words_file if line.strip()
            ]

        try:
            if word_list not in stop_words_list:
                return word_list
            else:
                print("Match found:" + str(word_list))
        except ValueError:
            print("no such value")

    def ingest(self, crfile, pagestack, billstack, speechstack, speechstack1):
        """
        Break a crdoc into three parts
        Pass the appropriate rows for each part
        to the right stack for a bulk insert.
        """
        #print(crfile)
        page_row = OrderedDict([('pageid', crfile['id']),
                                ('title', rd(crfile['doc_title'])),
                                ('chamber', crfile['header']['chamber']),
                                ('extension', crfile['header']['extension']),
                                ('cr_day', crfile['header']['day']),
                                ('cr_month', crfile['header']['month']),
                                ('cr_year', crfile['header']['year']),
                                ('num', crfile['header']['num']),
                                ('vol', crfile['header']['vol']),
                                ('wkday', crfile['header']['wkday'])])
        # Add the "page" level to the page stack first
        pagestack.add(page_row)

        bills = []
        if 'related_bills' in list(crfile.keys()):
            for bill in crfile['related_bills']:
                bill_row = OrderedDict([('congress', bill['congress']),
                                        ('context', bill['context']),
                                        ('bill_type', bill['type']),
                                        ('bill_no', bill['number']),
                                        ('pageid', crfile['id'])])
                bills.append(bill_row)

        # Bills for the bill god!
        billstack.add(bills)

        #speeches = []
        ''' for speech in crfile['content']:
            if speech['kind'] == 'speech':
                speechid = crfile['id'] + '-' + str(speech['turn'])
		test = 'test string'
                speech_row = OrderedDict([('speechid',speechid),
                              ('speaker',speech['speaker']),
                              ('speaker_bioguide',speech['speaker_bioguide']),
                              ('pageid',crfile['id']),
                              ('text',rd(speech['text'])),
                              ('turn',speech['turn']),
			      ('party',test)
                             ]) # Gotta get rid of delimiter char
                speeches.append(speech_row)'''
        speeches_republican = []
        speeches_democratic = []
        #speech_row_D =[]
        #speech_row_R =[]
        democratic_data_output = ''
        republican_data_output = ''

        for speech in crfile['content']:

            if speech['kind'] == 'speech':
                #speechid = crfile['id'] + '-' + str(speech['turn'])
                #test = 'anannya'
                #print(speech)
                import json
                #print(speech['speaker_bioguide'])
                #print(rd(speech['text']))

                v = str(speech['speaker_bioguide']) + "||" + str(
                    rd(speech['text']))
                '''with open('speeches_test','a+') as out_json:
                    json.dump(v,out_json)'''
                if speech['speaker_bioguide']:
                    keybioguideid = speech['speaker_bioguide']
                    outpath = os.path.join('', 'json', keybioguideid + '.json')
                    #print(outpath)
                    #outpath = 'json\\'+keybioguideid+'.json'
                    with open(outpath) as json_data:
                        d = json.load(json_data)
                    #print(rd(speech['text']))
                    #print('*****************************************************************************************************************************************')

                    #print(speech_remove_sort_words)
                    if d['party'] == 'D':
                        speech_row_D = []

                        speech_remove_stop_words = []
                        speech_remove_stop_words = self.remove_stop_words(
                            rd(speech['text']))
                        #print(d['party'])

                        current_speaker_data = '\n\n' + speech_remove_stop_words
                        democratic_data_output = democratic_data_output + current_speaker_data + '\n'

                        speech_row_D = OrderedDict([
                            #('speechid',speechid),
                            ('affiliation', 'Affiliation:' + d['party']),
                            ('speaker', speech['speaker']),
                            #('speaker_bioguide',speech['speaker_bioguide']),
                            #('pageid',crfile['id']),
                            ('text', speech_remove_stop_words),
                            #('turn',speech['turn'])
                        ])
                        speeches_democratic.append(speech_row_D)
                        '''if len(speech_remove_stop_words):
					#print(speech_remove_stop_words)
					speeches_democratic.append(speech_row_D)
				else:
					pass
				#print(str(keybioguideid) + "D")'''

                    elif d['party'] == 'R':
                        speech_row_D = []
                        speech_remove_stop_words = []
                        speech_remove_stop_words = self.remove_stop_words(
                            rd(speech['text']))

                        #print(speech_remove_stop_words,stemmer.stem(speech_remove_stop_words))

                        current_speaker_data = '\n\n' + speech_remove_stop_words
                        republican_data_output = republican_data_output + current_speaker_data + '\n'
                        speech_row_R = OrderedDict([
                            #('speechid',speechid),
                            ('affiliation', 'Affiliation:' + d['party']),
                            ('speaker', speech['speaker']),
                            #('speaker_bioguide',speech['speaker_bioguide']),
                            #('pageid',crfile['id']),
                            #('text',''),
                            ('text', speech_remove_stop_words),
                            #('turn',speech['turn'])
                        ])
                        speeches_republican.append(speech_row_R)
                        #print(str(keybioguideid) + "R")
                        '''if len(speech_remove_stop_words):
					#print(speech_remove_stop_words)
			      		speeches_republican.append(speech_row_R)
			      else:
					pass'''

                else:
                    keybioguideid = 'dummy'
                    #print(str(keybioguideid))

                #pr.find_people(pr(),'','')

                # SPEECHES FOR THE SPEECH THRONE
                #print(speeches_republican)
                #print(speeches_democratic)
                #print(democratic_data_output)

                speechstack.add(speeches_republican)
                speechstack1.add(speeches_democratic)
        import json
        #print(democratic_data_output)
        with open('democratic_speeches.txt', 'a+') as out_json:
            out_json.write(democratic_data_output)
        with open('republican_speeches.txt', 'a+') as out_json:
            out_json.write(republican_data_output)

    def find_people(self):
        mbrs = self.doc_ref.find_all('congmember')
        if mbrs:
            for mbr in mbrs:
                self.speakers[mbr.find('name',
                                     {'type':'parsed'}).string] = \
                                     self.people_helper(mbr)

    '''def people_helper(self,tagobject):
        output_dict = {}
        if 'bioguideid' in tagobject.attrs:
            output_dict['bioguideid'] = tagobject['bioguideid']
        elif 'bioGuideId' in tagobject.attrs:
            output_dict['bioguideid'] = tagobject['bioGuideId']
        else:
            output_dict['bioguideid'] = 'None'
        for key in ['chamber','congress','party','state','role']:
            if key in tagobject.attrs:
                output_dict[key] = tagobject[key]
            else:
                output_dict[key] = 'None'
        try:
            output_dict['name_full'] = tagobject.find('name',{'type':'authority-fnf'}).string
        except:
            output_dict['name_full'] = 'None'
	#print(output_dict)
        return output_dict

    # Flow control for metadata generation
    def gen_file_metadata(self):
        # Sometimes the searchtitle has semicolons in it so .split(';') is a nogo
        temp_ref = self.cr_dir.mods.find('accessid', text=self.access_path)
        if temp_ref is None:
            raise RuntimeError("{} doesn't have accessid tag".format(self.access_path))
        self.doc_ref = temp_ref.parent
        matchobj = re.match(self.re_vol, self.doc_ref.searchtitle.string)
        if matchobj:
            self.doc_title, self.cr_vol, self.cr_num = matchobj.group('title','vol','num')
        else:
            logging.warn('{0} yields no title, vol, num'.format(
                self.access_path))
            self.doc_title, self.cr_vol, self.cr_num = \
              'None','Unknown','Unknown'
        self.find_people()
        self.find_related_bills()
        self.find_related_laws()
        self.find_related_usc()
        self.find_related_statute()
        self.date_from_entry()
        self.chamber = self.doc_ref.granuleclass.string
        self.re_newspeaker = self.make_re_newspeaker()
        self.item_types['speech']['patterns'] = [self.re_newspeaker]'''

    def __init__(self, start, **kwargs):
        """
        BE SURE TO INCLUDE do_mode='yield' in kwargs!
        This object handles flow control for new data
        entering a Postgres database using congressionalrecord2s
        data model.

        It breaks the incoming Python dictionaries into three stacks
        of rows, one for each table in this data model.

        It writes the results to each of three flatfiles suitable for
        a bulk update through COPY.

        This is the way to minimize the number
        of transactions to the database, which we want.
        """
        kwargs['do_mode'] = 'yield'
        if 'csvpath' in kwargs:
            pass
        else:
            kwargs['csvpath'] = 'dbfiles'
        pagepath, billpath, speechpath, speechpath1 = [
            os.path.join(kwargs['csvpath'], filename) for filename in
            ['pages.csv', 'bills.csv', 'speeches_R.csv', 'speeches_D.csv']
        ]
        self.downloader = dl(start, **kwargs)
        self.doc_ref = ''
        memberlistfinal = []
        #object1 = congressionalrecord.fdsys.cr_parser.ParseCRDir()
        #print(object1)
        #self.cr_dir = '<congressionalrecord.fdsys.cr_parser.ParseCRDir object at 0x7f0c7c88cb90>'
        #self.cr_dir=cr_dir
        #self.gen_file_metadata()
        #print(pr.find_people(pr(self,'')))
        #self.find_people()
        #print('anannya'+str(pr.memberlist))
        #print(pr('/home/anannyadas/Desktop/congress/congressional-record-master/congressionalrecord/pg_run/fdsys'))
        self.page_fields = [
            'pageid', 'title', 'chamber', 'extension', 'cr_day', 'cr_month',
            'cr_year', 'num', 'vol', 'pages', 'wkday'
        ]
        self.bill_fields = [
            'congress', 'context', 'bill_type', 'bill_no', 'pageid'
        ]
        #self.speech_fields = ['speechid','affiliation','speaker','speaker_bioguide','pageid','text','turn']
        self.speech_fields = ['affiliation', 'speaker', 'text']
        pagestack = crPages(pagepath, self.page_fields)
        billstack = crBills(billpath, self.bill_fields)
        speechstack = crSpeeches(speechpath, self.speech_fields)
        speechstack1 = crSpeeches(speechpath1, self.speech_fields)
        for crfile in self.downloader.yielded:
            #print(crfile)
            doc = crfile.crdoc
            self.ingest(doc, pagestack, billstack, speechstack, speechstack1)
            # pagestack.write()
            # billstack.write()
            speechstack.write()
            speechstack1.write()
Example #17
class ParseCRFile(object):
    # Some regex
    re_time = r'^CREC-(?P<year>[0-9]{4})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})-.*'
    re_vol = r'^(?P<title>.*); Congressional Record Vol. (?P<vol>[0-9]+), No. (?P<num>[0-9]+)$'
    re_vol_file =   r'^\[Congressional Record Volume (?P<vol>[0-9]+), Number (?P<num>[0-9]+)'\
                    + r' \((?P<wkday>[A-Za-z]+), (?P<month>[A-Za-z]+) (?P<day>[0-9]+), (?P<year>[0-9]{4})\)\]'
    re_chamber =  r'\[(?P<chamber>[A-Za-z\s]+)\]'
    re_pages =  r'\[Page[s]? (?P<pages>[\w\-]+)\]'
    re_trail = r'From the Congressional Record Online'\
      + r' through the Government (Publishing|Printing) Office \[www.gpo.gov\]$'
    re_rollcall =       r'\[Roll(call)?( Vote)? No. \d+.*\]'
    re_recorderstart =  (r'^\s+(?P<start>'
                        + r'(The (assistant )?legislative clerk read as follows)'
                        + r'|(The nomination considered and confirmed is as follows)'
                        + r'|(The (assistant )?legislative clerk)'
                        + r'|(The nomination was confirmed)'
                        + r'|(There being no objection, )'
                        + r'|(The resolution .*?was agreed to.)'
                        + r'|(The preamble was agreed to.)'
                        + r'|(The resolution .*?reads as follows)'
                        + r'|(The assistant editor .*?proceeded to call the roll)'
                        + r'|(The bill clerk proceeded to call the roll.)'
                        + r'|(The bill clerk called the roll.)'
                        + r'|(The motion was agreed to.)'
                        #+ r'|(The Clerk read the resolution, as follows:)'
                        + r'|(The Clerk read (the resolution, )as follows:)'
                        + r'|(The resolution(, with its preamble,)? reads as follows:)'
                        + r'|(The amend(ment|ed).*?(is)? as follows:)'
                        + r'|(Amendment No\. \d+.*?is as follows:)'
                        + r'|(The yeas and nays resulted.*?, as follows:)'
                        + r'|(The yeas and nays were ordered)'
                        + r'|(The result was announced.*?, as follows:)'
                        + r'|(The .*?editor of the Daily Digest)'
                        + r'|(The (assistant )?bill clerk read as follows:)'
                        + r'|(The .*?read as follows:)'
                        + r'|(The text of the.*?is as follows)'
                        + r'|(amended( to read)? as follows:)'
                        + r'|(The material (previously )?referred to (by.*?)?is as follows:)'
                        + r'|(There was no objection)'
                        + r'|(The amendment.*?was agreed to)'
                        + r'|(The motion to table was .*)'
                        + r'|(The question was taken(;|.))'
                        + r'|(The following bills and joint resolutions were introduced.*)'
                        + r'|(The vote was taken by electronic device)'
                        + r'|(A recorded vote was ordered)'
                        #+ r'|()'
                        + r').*')
    # anchored at the end of the line
    re_recorderend =    (r'('
                        + r'(read as follows:)'
                        + r'|(the Record, as follows:)'
                        + r'|(ordered to lie on the table; as follows:)'
                        + r'|(resolutions as follows:)'
                        + r')$')
    # sometimes the recorder says something that is not unique to them but
    # which, in the right context, we take to indicate a recorder comment.
    re_recorder_fuzzy = (r'^\s+(?P<start>'
                        + r'(Pending:)'
                        + r'|(By M(r|s|rs)\. .* \(for .*)'
                        #+ r'|()'
                        + r').*')
    # NCJ's broader version below, tested on one day of the record.
    # works, honest
    re_recorder_ncj = (r'^\s+(?P<start>'
                       + r'(Pending:)'
                       + r'|(By M(r|rs|s|iss)[\.]? [a-zA-Z]+))'
                       )
    re_clerk = r'^\s+(?P<start>The Clerk (read|designated))'
    re_allcaps = r'^ \s*(?!([_=]+|-{3,}))(?P<title>([A-Z]+[^a-z]+))$'
    re_linebreak = r'\s+([_=]+|-{5,})(NOTE|END NOTE)?([_=]+|-{5,})*\s*'
    re_excerpt = r'\s+(_{3,4})'
    re_newpage =   r'\s*\[\[Page \w+\]\]'
    re_timestamp = r'\s+\{time\}\s+\d{4}'

    
    # Metadata-making functions
    def title_id(self):
        id_num = self.num_titles
        self.num_titles += 1
        return id_num
        
    def make_re_newspeaker(self):
        speaker_list = '|'.join([mbr for mbr in list(self.speakers.keys()) \
        if self.speakers[mbr]['role'] == 'SPEAKING'])
        if len(speaker_list) > 0:
            re_speakers = r'^(\s{1,2}|<bullet>)(?P<name>((' + speaker_list + ')|(((Mr)|(Ms)|(Mrs)|(Miss))\. (([-A-Z\'])(\s)?)+( of [A-Z][a-z]+)?)|(((The ((VICE|ACTING|Acting) )?(PRESIDENT|SPEAKER|CHAIR(MAN)?)( pro tempore)?)|(The PRESIDING OFFICER)|(The CLERK)|(The CHIEF JUSTICE)|(The VICE PRESIDENT)|(Mr\. Counsel [A-Z]+))( \([A-Za-z.\- ]+\))?)))\.'
        else:
            re_speakers = r'^(\s{1,2}|<bullet>)(?P<name>((((Mr)|(Ms)|(Mrs)|(Miss))\. (([-A-Z\'])(\s)?)+( of [A-Z][a-z]+)?)|((The ((VICE|ACTING|Acting) )?(PRESIDENT|SPEAKER|CHAIR(MAN)?)( pro tempore)?)|(The PRESIDING OFFICER)|(The CLERK)|(The CHIEF JUSTICE)|(The VICE PRESIDENT)|(Mr\. Counsel [A-Z]+))( \([A-Za-z.\- ]+\))?))\.'
        return re_speakers
    
    def people_helper(self,tagobject):
        output_dict = {}
        if 'bioguideid' in tagobject.attrs:
            output_dict['bioguideid'] = tagobject['bioguideid']
        elif 'bioGuideId' in tagobject.attrs:
            output_dict['bioguideid'] = tagobject['bioGuideId']
        else:
            output_dict['bioguideid'] = 'None'
        for key in ['chamber','congress','party','state','role']:
            if key in tagobject.attrs:
                output_dict[key] = tagobject[key]
            else:
                output_dict[key] = 'None'
        try:
            output_dict['name_full'] = tagobject.find('name',{'type':'authority-fnf'}).string
        except:
            output_dict['name_full'] = 'None'
        #print(output_dict)
        #cr.memberlistfinal.append(output_dict)

        ''' if 'json' not in os.listdir(outpath):
                    os.mkdir(os.path.join(outpath,'json'))'''
        with open('json/' + output_dict['bioguideid'] + '.json', 'w+') as out_json:
            json.dump(output_dict, out_json)
        return output_dict
    

        
    def find_people(self):
        mbrs = self.doc_ref.find_all('congmember')
        memberlist = mbrs
        #print(memberlist)
        if mbrs:
            for mbr in mbrs:
                self.speakers[mbr.find('name',
                                       {'type':'parsed'}).string] = \
                                       self.people_helper(mbr)
    
    
    def find_related_bills(self):
        related_bills = self.doc_ref.find_all('bill')
        if len(related_bills) > 0:
            self.crdoc['related_bills'] = \
              [bill.attrs for bill in related_bills]

    def find_related_laws(self):
        related_laws = self.doc_ref.find_all('law')
        if len(related_laws) > 0:
            self.crdoc['related_laws'] = \
              [law.attrs for law in related_laws]

    def find_related_usc(self):
        related_usc = self.doc_ref.find_all('uscode')
        if len(related_usc) > 0:
            self.crdoc['related_usc'] = list(
                itertools.chain.from_iterable(
                    [[dict([('title',usc['title'])] +
                        list(sec.attrs.items())) for sec
                        in usc.find_all('section')]
                        for usc in related_usc]
                    )
                )

    def find_related_statute(self):
        related_statute = self.doc_ref.find_all('statuteatlarge')
        if len(related_statute) > 0:
            self.crdoc['related_statute'] = list(
                itertools.chain.from_iterable(
                    [[dict([('volume',st['volume'])] +
                        list(pg.attrs.items())) for pg
                        in st.find_all('pages')]
                        for st in related_statute]
                    )
                )
        
    def date_from_entry(self):
        year, month, day = re.match(self.re_time,self.access_path).group('year','month','day')
        if self.doc_ref.time:
            from_hr,from_min,from_sec = self.doc_ref.time['from'].split(':')
            to_hr,to_min,to_sec = self.doc_ref.time['to'].split(':')
            try:
                self.doc_date = datetime(int(year),int(month),int(day))
                self.doc_start_time = datetime(int(year),int(month),int(day),\
                int(from_hr),int(from_min),int(from_sec))
                self.doc_stop_time = datetime(int(year),int(month),int(day),\
                int(to_hr),int(to_min),int(to_sec))
                self.doc_duration = self.doc_stop_time - self.doc_start_time
            except:
                logging.info('Could not extract a document timestamp.')
    
    # Flow control for metadata generation
    def gen_file_metadata(self):
        # Sometimes the searchtitle has semicolons in it so .split(';') is a nogo
        temp_ref = self.cr_dir.mods.find('accessid', text=self.access_path)

        #print(type(self.cr_dir))

        if temp_ref is None:
            raise RuntimeError("{} doesn't have accessid tag".format(self.access_path))
        self.doc_ref = temp_ref.parent
        matchobj = re.match(self.re_vol, self.doc_ref.searchtitle.string)
        if matchobj:
            self.doc_title, self.cr_vol, self.cr_num = matchobj.group('title','vol','num')
        else:
            logging.warn('{0} yields no title, vol, num'.format(
                self.access_path))
            self.doc_title, self.cr_vol, self.cr_num = \
              'None','Unknown','Unknown'
        self.find_people()
        self.find_related_bills()
        self.find_related_laws()
        self.find_related_usc()
        self.find_related_statute()
        self.date_from_entry()
        self.chamber = self.doc_ref.granuleclass.string
        self.re_newspeaker = self.make_re_newspeaker()
        self.item_types['speech']['patterns'] = [self.re_newspeaker]

    # That's it for metadata. Below deals with content.

    def read_htm_file(self):
        """
        This function updates a self.cur_line
        attribute. So now for each call to the iterator there are two
        pointers to the next line - one for the function,
        and one for the object.

        The purpose of the attribute is to
        give each parsing function a "starting position"
        so that the handshake between functions is easier. Now
        the current (or last) line is tracked in only one place
        and the same way by all object methods.
        """
        self.lines_remaining = True
        with open(self.filepath, 'r') as htm_file:
            htm_lines = htm_file.read()
            htm_text = BeautifulSoup(htm_lines,"lxml")
        text = htm_text.pre.text.split('\n')
        for line in text:
            self.cur_line = line
            yield line
        self.lines_remaining = False
    
    def get_header(self):
        """
        Only after I wrote this did I realize
        how bad things can go when you call
        next() on an iterator instead of treating
        it as a list.

        This code works, though.
        """
        header_in = next(self.the_text)
        if header_in == u'':
            header_in = next(self.the_text)
        match = re.match(self.re_vol_file, header_in)
        if match:
            vol, num, wkday, month, day, year = match.group( \
            'vol','num','wkday','month','day','year')
        else:
            return False
        header_in = next(self.the_text)
        match = re.match(self.re_chamber, header_in)
        if match:
            if match.group('chamber') == 'Extensions of Remarks':
                chamber = 'House'
                extensions = True
            else:
                chamber = match.group('chamber')
                extensions = False
        else:
            return False
        header_in = next(self.the_text)
        match = re.match(self.re_pages, header_in)
        if match:
            pages = match.group('pages')
        else:
            return False
        header_in = next(self.the_text)
        match = re.match(self.re_trail, header_in)
        if match:
            pass
        else:
            return False
        return vol, num, wkday, month, day, year, chamber, pages, extensions

    def write_header(self):
        self.crdoc['id'] = self.access_path
        header = self.get_header()
        if header:
            self.crdoc['header'] = {'vol':header[0],'num':header[1],\
            'wkday':header[2],'month':header[3],'day':header[4],\
            'year':header[5],'chamber':header[6],'pages':header[7],\
            'extension':header[8]}
        self.crdoc['doc_title'] = self.doc_title

    def get_title(self):
        """
        Throw out empty lines
        Parse consecutive title-matching strings into a title str
        Stop on the first line that isn't empty and isn't a title
        Return the title str if it exists.

        We pretty much assume the first title on the page applies
        to everything below it
        """

        title_str = ''
        for line in self.the_text:
            if line == u'':
                pass
            else:
                a_match = re.match(self.re_allcaps, line)
                if a_match:
                    title_str = ' '.join([title_str,a_match.group('title')])
                else:
                    break

        if len(title_str) > 0:
            return title_str.strip()
        else:
            return False

    def write_page(self):
        turn = 0
        itemno = 0
        title = self.get_title()
        the_content = []
        if title:
            self.crdoc['title'] = title
        else:
            self.crdoc['title'] = None
        while self.lines_remaining:
            # while not re.match(self.re_allcaps,self.cur_line):
            try:
                item = crItem(self).item
                if item['kind'] == 'speech':
                    item['turn'] = turn
                    turn += 1
                item['itemno'] = itemno
                itemno += 1
                the_content.append(item)
            except Exception as e:
                logging.warn('{0}'.format(e))
                break

        self.crdoc['content'] = the_content

        logging.debug('Stopped writing {0}. The last line is: {1}'.format(self.access_path,self.cur_line))

    def parse(self):
        """
        Flow control for parsing content.
        """
        self.the_text = self.read_htm_file()
        self.write_header()
        self.write_page()

    """
    This is a dict of line cases.
    In previous versions, these relations were called
    explicitly multiple times in multiple places.

    This way is more extensible and easier to track cases.

    Usage:
    If break_flow == True: <interrupt current item>
    If speaker_re == True: speaker = re.match(line,
                                     <pattern from patterns>).
                                     .group(<speaker_group>)
    else: speaker = <speaker>
    (ALSO -- see line 176 for how speech patterns is populated)
    It has to come after some of the functions because of
    how I want to handle special cases.
    """
    item_types = { 'speech':
                   {'patterns':['Mr. BOEHNER'],
                    'speaker_re':True,
                    'speaker_group':'name',
                    'break_flow':True,
                    'special_case':False
                    },
                    'recorder':
                    {'patterns':[re_recorderstart,
                                 re_recorderend,
                                 re_recorder_ncj],
                    'speaker_re':False,
                    'speaker':'The RECORDER',
                    'break_flow':True,
                    'special_case':False
                    },
                    'clerk':
                    {'patterns':[re_clerk],
                     'speaker_re':False,
                     'speaker':'The Clerk',
                     'break_flow':True,
                     'special_case':False
                     },
                     'linebreak':
                     {'patterns':[re_linebreak],
                      'speaker_re':False,
                      'speaker':'None',
                      'break_flow':True,
                      'special_case':True,
                      'condition':'emptystr'
                      },
                      'excerpt':
                      {'patterns':[re_excerpt],
                       'speaker_re':False,
                       'speaker':'None',
                       'break_flow':True,
                       'special_case':True,
                       'condition':'lastspeaker'
                       },
                      'rollcall':
                      {'patterns':[re_rollcall],
                      'speaker_re':False,
                      'speaker':'None',
                      'break_flow':True,
                      'special_case':False
                      },
                      'metacharacters':
                      {'patterns':[re_timestamp,
                                   re_newpage],
                       'speaker_re':False,
                       'speaker':'None',
                       'break_flow':False,
                       'special_case':False
                       },
                      'empty_line':
                      {'patterns':[r'(^[\s]+$)'],
                       'speaker_re':False,
                       'speaker':'None',
                       'break_flow':False,
                       'special_case':False
                       },
                       'title':
                       {'patterns':[re_allcaps],
                        'speaker_re':False,
                        'speaker':'None',
                        'break_flow':True,
                        'special_case':False,
                        }
                    }

    stemmer = Porter2Stemmer()

    #remove stop words
    def remove_stop_words(self, frequency_list):
        # Load the comma-separated stop-word list.
        with open("stop_words_list.txt") as stop_words_list:
            stop_words = stop_words_list.read().split(',')

        stemmer = Porter2Stemmer()
        result = ''
        try:
            # Strip punctuation before tokenizing.
            for c in string.punctuation:
                frequency_list = frequency_list.replace(c, "")

            querywords = frequency_list.split()

            # Stem every token that is not a stop word.
            resultwords = [stemmer.stem(word) for word in querywords
                           if word.lower() not in stop_words]

            result = ' '.join(resultwords)
        except ValueError:
            print("no such value")

        return result


    def __init__(self, abspath, cr_dir, **kwargs):

        # Some metadata
        self.crdoc = {}
        self.crdoc['header'] = False
        self.crdoc['content'] = []
        self.num_titles = 0
        self.speakers = {}
        self.doc_ref = ''
        self.doc_time = -1
        self.doc_start_time = -1
        self.doc_stop_time = -1
        self.doc_duration = -1
        self.doc_chamber = 'Unspecified'
        self.doc_related_bills = []
        
        # file data
        self.filepath = abspath
        self.filedir, self.filename = os.path.split(abspath)
        self.cr_dir = cr_dir
        #print(cr_dir)
        self.access_path = self.filename.split('.')[0]

        # Generate all metadata including list of speakers
        self.gen_file_metadata()
        # Must come after speaker list generation
        self.item_breakers = []
        self.skip_items = []
        for x in list(self.item_types.values()):
            if x['break_flow'] == True:
                self.item_breakers.extend(x['patterns'])
            else:
                self.skip_items.extend(x['patterns'])

        # Parse the file
        self.parse()

        # Route each parsed speech into per-speaker and per-party output files
        for speech in self.crdoc['content']:
            if speech['kind'] == 'speech' and speech['speaker_bioguide']:
                keybioguideid = speech['speaker_bioguide']
                outpath = os.path.join('', 'json', keybioguideid + '.json')

                with open(outpath) as json_data:
                    d = json.load(json_data)

                if d['party'] == 'D':
                    print("D")
                    party_dir = 'democratic'
                    party_file = 'democratic_speeches.txt'
                elif d['party'] == 'R':
                    print("R")
                    party_dir = 'republican'
                    party_file = 'republican_speeches.txt'
                else:
                    continue

                cleaned = str(self.remove_stop_words(speech['text']))
                speaker_path = party_dir + '/' + speech['speaker'] + '-' + str(keybioguideid) + '.txt'

                with open(speaker_path, 'a+') as out_file:
                    out_file.write(cleaned + '\n')
                os.chmod(speaker_path, 0o777)

                with open(party_file, 'a+') as out_file:
                    out_file.write(cleaned + '\n')
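Note: the routing above keys off the 'party' field in each legislator's bioguide JSON record. A minimal standalone sketch of that lookup; the load_party helper name, the placeholder id, and the json/ layout are illustrative assumptions rather than part of the original project:

import json
import os

def load_party(bioguide_id, json_dir='json'):
    # Hypothetical helper mirroring the lookup in __init__ above:
    # read json/<bioguide_id>.json and return its 'party' field ('D' or 'R').
    with open(os.path.join(json_dir, bioguide_id + '.json')) as fh:
        return json.load(fh).get('party')

# Usage sketch with a placeholder bioguide id:
# out_dir = 'democratic' if load_party('X000001') == 'D' else 'republican'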
Example #18
0
def index_file(file_name, documentID):
    stemmer = Porter2Stemmer()
    k = kgram_index()

    # Dealing with punctuation
    p = dict.fromkeys(string.punctuation)
    p.pop('-')  # we need to deal with hyphens
    punctuation = str.maketrans(p)
    weight_map = {}

    try:
        with open(file_name) as json_file:
            article_data = json.load(json_file)
            body = unidecode.unidecode(
                article_data['body']).lower().translate(punctuation).split(' ')
            body = list(
                filter(lambda t: t != '' and t != '-',
                       body))  # remove single spaces and single hyphens

            #kgram stuff here

            position = 0
            for term in body:
                #kgram stuff here
                kgram_list = []
                # develop a list of kgram tokens for one specific term
                # kgram doesn't need to deal with hyphens because the tokens will be created anyways
                for i in range(1, 4):
                    if i == 1:
                        kgram_list.extend(k.create_kgram(term, i))
                    else:
                        s = ('$' + term + '$')
                        kgram_list.extend(k.create_kgram(s, i))
                # Shove each of those tokens into the grand vocab dictionary
                for token in kgram_list:
                    if token in vocab:
                        vocab[token].add(term)
                    else:
                        vocab[token] = set([term])

                # take care of hyphenated words
                if '-' in term:
                    unhyphenated_word = term.replace('-', '')
                    index.add_term(stemmer.stem(unhyphenated_word), documentID,
                                   position)
                    hyphened_tokens = term.split('-')
                    for t in hyphened_tokens:
                        index.add_term(stemmer.stem(t), documentID, position)
                else:
                    index.add_term(stemmer.stem(term), documentID, position)
                position += 1
                if term not in weight_map:
                    weight_map[term] = 1
                else:
                    weight_map[term] = weight_map[term] + 1

    except FileNotFoundError as e:
        print(e)

    wdt = 0
    i_writer = index_writer()
    # Sum the squared term weights w_(d,t) = 1 + ln(tf) so that L_d = sqrt(sum)
    for tf in weight_map:
        wdt += pow(1 + log(weight_map[tf]), 2)
    Ld = sqrt(wdt)
    i_writer.write_ld(Ld)
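Note: the last few lines compute the document weight L_d used in ranked retrieval: each term contributes w_(d,t) = 1 + ln(tf_(t,d)), and L_d is the Euclidean norm of those weights. A minimal sketch of just that calculation with a made-up weight_map:

from math import log, sqrt

weight_map = {'yosemite': 3, 'wildfire': 1}  # term -> tf within one document (made up)
Ld = sqrt(sum(pow(1 + log(tf), 2) for tf in weight_map.values()))
print(Ld)  # sqrt((1 + ln 3)^2 + 1^2) ~= 2.32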
Example #19
0
def main():
    # Instances
    # w = wildcard()
    n = near()
    # directory = input('Enter directory for index: ')  # TODO Revert back to original when done

    # TODO This is for testing purposes, so i can compare output
    # test_dir = '/Users/Cemo/Documents/cecs429/search_engine/corpus/mlb_documents'
    # test_dir = '/Users/Cemo/Documents/cecs429/search_engine/corpus/kumin'
    # test_dir = '/Users/Cemo/Documents/cecs429/search_engine/corpus/disk_test'
    cwd = getcwd()
    start_time = time.time()

    corpus_size = len(
        listdir(
            '/Users/Cemo/Documents/cecs429/search_engine/corpus/all-nps-sites')
    )

    # init(test_dir)
    print("--- %s seconds ---" % str((time.time() - start_time) / 60))

    while 1:
        chdir(
            cwd
        )  # Changing to the directory of with the DB file in it for sqlite
        query_or_index = input('[1] - Query\n[2] - Index\n')
        print(query_or_index)
        if query_or_index == '1':

            query_type = input('[1] - Rank\n[2] - Boolean\n')
            if query_type == '1':
                r = rank()
                q = input('Enter query: ')
                r.get_rank(q, corpus_size)
                # print(r.get_rank(q, corpus_size))

                # print(r.get_rank('wildfire in yosemite', corpus_size))
            else:
                return_docs = []

                user_string = input("Please enter a word search:\n")
                # Special Queries
                if ':' in user_string:
                    if ':q' in user_string:
                        exit()
                    if ':stem' in user_string:
                        stemmer = Porter2Stemmer()
                        print("Will be stemming the token")
                        print(user_string.split(" ")[1])
                        print(stemmer.stem(user_string.split(" ")[1]))
                    if ':index' in user_string:
                        print('Will be indexing folder')
                        init(user_string.split(" ")[1].rstrip().lstrip())
                    if ':vocab' in user_string:
                        pp = pprint.PrettyPrinter(indent=4)
                        pp.pprint(index.get_dictionary())
                        print('Total number of vocabulary terms: ' +
                              str(index.get_term_count()))
                        print('Will be spitting out words')
                elif '*' in user_string:
                    print("This will get sent of to the wildcard class")
                    return_docs.extend(wild(user_string))
                elif 'near' in user_string:
                    # Parse NEAR input
                    near_parts = user_string.split(' ')
                    k = near_parts[1].split('/')
                    return_docs.extend(
                        n.near(index, near_parts[0], near_parts[2], int(k[1])))
                else:
                    if user_string:
                        q = Query()
                        return_docs = q.query_parser(user_string)
                    else:
                        print('No query entered')

                print('DOC_LIST: ' + str(return_docs))

                # Allow the user to select a document to view
                doc_list = list(map(document_parser, return_docs))
                if len(doc_list) != 0:
                    for document in doc_list:
                        print('Document ' + document)
                    print('Documents found: ' + str(len(doc_list)))
                    document_selection = input(
                        'Please select a document you would like to view: ')
                    while document_selection != 'no':
                        if document_selection in doc_list:
                            open_file_content(document_selection)
                        document_selection = input(
                            'Please select a document you would like to view: '
                        )
                else:
                    print('No documents were found')
        else:
            print("Please don't")
            directory = input('Enter directory for index: '
                              )  # TODO Revert back to original when done
            init(directory)
            i_writer = index_writer()
            i_writer.write_index_to_disk(index.get_index())
Example #20
0
def stemming():
    stemmer = Porter2Stemmer()
    for w in words():
        print(w, stemmer.stem(w), sep='\t')
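Note: the words() generator that stemming() iterates over is not part of this snippet. A hypothetical stand-in (the file name and whitespace tokenization are assumptions) could look like:

def words(path='vocabulary.txt'):
    # Hypothetical helper: yield whitespace-separated tokens from a text file.
    with open(path) as fh:
        for line in fh:
            yield from line.split()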
Example #21
0
import re
from porter2stemmer import Porter2Stemmer
from random import shuffle
import math
import numpy as np
import matplotlib.pyplot as plt

##### Stop words
stoplist = []
f = open("/home/sree/Machine-Learning-course/Assignment2/stopwords.txt",'r')
for line in f: 
	stoplist.append(line.replace("\n",""))
#print stoplist

stemmer = Porter2Stemmer()


##### Remove stop words from messages and form 2-d list
wosw = []
f = open("/home/sree/Machine-Learning-course/Assignment2/Assignment_2_data.txt",'r')
for line in f: 
	z = re.split('\t| |;|,|\*|\n|\.',line)
	b = filter(lambda a: a != '', z)
	l3 = [stemmer.stem(x) for x in b if x.lower() not in stoplist]
	wosw.append(l3)
shuffle(wosw)
#print wosw

##### Make a list of tokens
tokens = []
for row in wosw:
Example #22
0
def test_index_txt_file():
    txt_index = positional_inverted_index()
    stemmer = Porter2Stemmer()
    k = kgram_index()

    file_names = []
    documentID = 1

    # Dealing with punctuation
    p = dict.fromkeys(string.punctuation)
    p.pop('-') # we need to deal with hyphens
    punctuation = str.maketrans(p)

    directory = path.dirname(path.realpath(__file__)) + '/unit_test_docs/'
    chdir(directory)

    for file in listdir(directory):
        if file.endswith('.txt'):
            file_names.append(str(file))

    for file in file_names:
        try:
            with open(file) as txt_file:

                content = txt_file.readlines()
                content = content[0].lower().translate(punctuation).split(' ')
                content = list(filter(lambda w: w != '', map(lambda s: s.strip(), content)))

                positions_dict = {}
                for i in range(0, len(content)):
                    if '-' in content[i]:

                        hyphened_word_parts = content[i].split('-')
                        hyphened_word = content[i].replace('-', '')
                        hyphened_word_parts.append(hyphened_word)

                        for word in hyphened_word_parts:
                            if word in positions_dict:
                                positions_dict[word].append(i)
                            else:
                                positions_dict[word] = [i]
                    else:

                        if content[i] in positions_dict:
                            positions_dict[content[i]].append(i)
                        else:
                            positions_dict[content[i]] = [i]


                for key in positions_dict:
                    txt_index.add_term(stemmer.stem(key), documentID, positions_dict[key])
        except FileNotFoundError as e:
            i = 0
            print(e)

        documentID = documentID + 1

    for key in txt_index.get_index():
        txt_index.print_term_info(key)

    correct_map = {}
    correct_map['today'] = [posting(1, [0]), posting(2, [0]), posting(3, [0])]
    correct_map['i'] = [posting(1, [1, 6, 11]), posting(2, [1]), posting(3, [1]), posting(4, [0])]
    correct_map['fell'] = [posting(1, [2])]
    correct_map['in'] = [posting(1, [3])]
    correct_map['a'] = [posting(1, [4])]
    correct_map['well'] = [posting(1, [5])]
    correct_map['have'] = [posting(1, [7]), posting(4, [1])]
    correct_map['no'] = [posting(1, [8]), posting(5, [8])]
    correct_map['mouth'] = [posting(1, [9])]
    correct_map['but'] = [posting(1, [10])]
    correct_map['want'] = [posting(1, [12])]
    correct_map['to'] = [posting(1, [13])]
    correct_map['scream'] = [posting(1, [14])]
    correct_map['top'] = [posting(2, [2])]
    correct_map['deck'] = [posting(2, [3])]
    correct_map['lethal'] = [posting(2, [4])]
    correct_map['yogg'] = [posting(2, [5])]
    correct_map['saron'] = [posting(2, [5])]
    correct_map['yoggsaron'] = [posting(2, [5])]
    correct_map['f**k'] = [posting(2, [6])]
    correct_map['me'] = [posting(2, [7]), posting(4, [8])]
    correct_map['over'] = [posting(2, [8])]
    correct_map['super'] = [posting(2, [9])]
    correct_map['hard'] = [posting(2, [10])]
    correct_map['learn'] = [posting(3, [2])]
    correct_map['the'] = [posting(3, [3]), posting(5, [2])]
    correct_map['mean'] = [posting(3, [4])]
    correct_map['of'] = [posting(3, [5])]
    correct_map['pain'] = [posting(3, [6])]
    correct_map['it'] = [posting(3, [7]), posting(4, [9]), posting(5, [12])]
    correct_map['was'] = [posting(3, [8]), posting(4, [10])]
    correct_map['all'] = [posting(3, [9])]
    correct_map['caus'] = [posting(3, [10])]
    correct_map['by'] = [posting(3, [11])]
    correct_map['nealdt'] = [posting(3, [12])]
    correct_map['ascend'] = [posting(4, [2])]
    correct_map['into'] = [posting(4, [3])]
    correct_map['enlighten'] = [posting(4, [4])]
    correct_map['my'] = [posting(4, [5])]
    correct_map['waifu'] = [posting(4, [6])]
    correct_map['told'] = [posting(4, [7])]
    correct_map['actual'] = [posting(4, [11])]
    correct_map['okay'] = [posting(4, [12])]
    correct_map['jesus'] = [posting(5, [0])]
    correct_map['take'] = [posting(5, [1])]
    correct_map['wheel'] = [posting(5, [3])]
    correct_map['or'] = [posting(5, [4])]
    correct_map['els'] = [posting(5, [5])]
    correct_map['asian'] = [posting(5, [6])]
    correct_map['driver'] = [posting(5, [7])]
    correct_map['survivor'] = [posting(5, [9])]
    correct_map['dont'] = [posting(5, [10])]
    correct_map['let'] = [posting(5, [11])]
    correct_map['happen'] = [posting(5, [13])]

    for keys in txt_index.get_index():
        assert keys in correct_map
Example #23
0
class RecommendationModule:
    data_list = None
    data_n = None
    cleaned_data_list = []

    vocab = set()

    stop_words = set()  # set of stop words
    stemmer = Porter2Stemmer()  # init porter stemmer

    config_path = None
    config = None

    conn = None

    def __init__(self):
        """
        init config, stop words, data list, database
        """
        self.config_path = 'config.ini'
        self.config = configparser.ConfigParser()
        self.config.read(self.config_path, 'utf-8')

        with open(self.config['DEFAULT']['STOPWORDS_PATH'],
                  encoding='utf-8') as f:
            self.stop_words = set(f.read().split())

        self.conn = sqlite3.connect(self.config['DEFAULT']['SE_DB_PATH'])
        self.conn.row_factory = sqlite3.Row
        c = self.conn.cursor()
        self.data_list = c.execute('select * from recipes').fetchall()
        self.data_n = len(self.data_list)

        c.execute('drop table if exists k_nearest')
        c.execute(
            'create table k_nearest (id integer primary key, '
            'nn1 integer, nn2 integer, nn3 integer, nn4 integer, nn5 integer)')
        self.conn.commit()

    def __del__(self):
        """
        close the database
        :return:
        """
        self.conn.close()

    def get_list_maxnum_index(self, num_list, top):
        """
        get the index of the maximum number in the list
        :param num_list:
        :param top:
        :return:
        """
        num_dict = {}
        for i in range(len(num_list)):
            num_dict[i] = num_list[i]
        res_list = sorted(num_dict.items(), key=lambda e: e[1])
        max_num_index = [one[0] for one in res_list[::-1][:top]]

        return list(max_num_index)

    def data_cleanup_tf(self, data):
        """
        clean data and construct tf dictionary
        :param data:
        :return: length of data and tf dictionary
        """
        tf_dict = {}  # {term: tf, ...}
        n = 0  # length of data

        terms = data.lower().split()  # lower the data and split
        for term in terms:
            # filter stop words having quotation marks
            # filter sites in a simple way
            if (term not in self.stop_words) and ('http' not in term) and (
                    'www' not in term):
                term = re.sub(r'[^a-z]', '',
                              term)  # remove non-alphabetic letters
                # filter stop words again and blank term
                if (term not in self.stop_words) and (len(term) != 0):
                    term = self.stemmer.stem(term)  # stemming
                    n += 1
                    if term in tf_dict:
                        tf_dict[term] += 1
                    else:
                        tf_dict[term] = 1
        return n, tf_dict

    def construct_data_vocab(self):
        """
        construct vocabulary with only title
        :return:
        """
        for recipe in self.data_list:
            name = recipe['name']
            # ingredients = recipe['ingredients']

            term_tf = self.data_cleanup_tf(name)[1]

            for term in term_tf.keys():
                self.vocab.add(term)

            self.cleaned_data_list.append(list(term_tf.keys()))

    def write_row_to_db(self, rid_self, rid_list):
        """
        write each row into database
        :param rid_self:
        :param rid_list:
        :return:
        """
        c = self.conn.cursor()

        values = (rid_self, rid_list[0], rid_list[1], rid_list[2], rid_list[3],
                  rid_list[4])
        c.execute('insert into k_nearest values (?, ?, ?, ?, ?, ?)', values)

        self.conn.commit()

    def construct_k_nearest(self):
        """
        construct the k nearest rid
        :return:
        """
        word2id = {}
        for word_id, word in enumerate(self.vocab):
            word2id[word] = word_id

        row2rid = {}  # convert the row id to the recipe id

        matrix_size = (self.data_n, len(word2id))
        X = dok_matrix(matrix_size)

        for i, recipe in enumerate(self.data_list):
            rid = recipe['id']
            name = recipe['name']

            row2rid[i] = rid

            term_tf = self.data_cleanup_tf(name)[1]
            for term, tf in term_tf.items():
                X[i, word2id[term]] = tf

        knn = NearestNeighbors(n_neighbors=6).fit(X)
        for row, x in enumerate(self.cleaned_data_list):
            x_in = dok_matrix((1, len(word2id)))
            for term in x:
                x_in[0, word2id[term]] += 1
                # print(word2id[term])
                # print(x_in)

            neighbours = knn.kneighbors(x_in, 6, return_distance=False)[0]
            rid_self = row2rid[row]
            rid_list = list(
                set([row2rid[row] for row in neighbours]) - set([rid_self]))
            # print(neighbours)
            # print([row2rid[row] for row in neighbours])
            # print(rid_self)
            # print(rid_list)
            self.write_row_to_db(rid_self, rid_list)

        # dictionary = corpora.Dictionary(cleaned_data_list)  # generate the dictionary
        # corpus = [dictionary.doc2bow(item) for item in cleaned_data_list]
        # tfidf = models.TfidfModel(corpus)
        # num_features = len(dictionary.token2id.keys())  # number of terms in the dictionary
        # index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=num_features)

        # for i, data in enumerate(cleaned_data_list):
        #     vector = dictionary.doc2bow(data)  # convert to svm
        #     sims = index[tfidf[vector]]
        #     row_list = self.get_list_maxnum_index(list(sims), 6)
        #     rid_list = [row2rid[row] for row in row_list]
        #     self.write_row_to_db(row2rid[i], rid_list.remove(row2rid[i])[:5])

    def find_k_nearest(self):
        """
        find the k nearest rid and write to database
        :return:
        """
        self.construct_data_vocab()
        self.construct_k_nearest()
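Note: construct_k_nearest boils down to building a sparse term-frequency matrix over recipe titles and querying it with scikit-learn's NearestNeighbors. A stripped-down sketch of that idea, with toy titles in place of the database rows:

from scipy.sparse import dok_matrix
from sklearn.neighbors import NearestNeighbors

titles = [['tomato', 'soup'], ['tomato', 'pasta'], ['chocolate', 'cake']]  # toy cleaned titles
word2id = {w: i for i, w in enumerate(sorted({t for title in titles for t in title}))}

X = dok_matrix((len(titles), len(word2id)))
for row, title in enumerate(titles):
    for term in title:
        X[row, word2id[term]] += 1

knn = NearestNeighbors(n_neighbors=2).fit(X)

q = dok_matrix((1, len(word2id)))   # query vector for the first title, built as in the code above
for term in titles[0]:
    q[0, word2id[term]] += 1
print(knn.kneighbors(q, 2, return_distance=False)[0])  # row indices of the nearest titles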
Example #24
0
def doStem(word):
    stemmer = Porter2Stemmer()
    return stemmer.stem(word)
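Note: a quick usage check of doStem, assuming the porter2stemmer package is installed; Porter2 strips regular inflectional endings, so output along these lines is expected:

print(doStem('running'))     # expected: 'run'
print(doStem('generously'))  # expected: 'generous'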
Example #25
0
truncate_similarities()
# In case the data is read from the database then the following is required.
# article_master = import_content()

# LOCAL IMPORT
article_master = pd.read_csv(os.path.abspath("./data/content_metadata.csv"))

## PREPROCESS CONTENT
print("Previous Model Truncated.")
print("Pre-processing....")

# REDUCE CONTENT:
article_master['reduced_content'] = article_master.apply \
    (lambda row: re.sub(r'[^a-z\s]', '', filter_html(row.bodytext).lower()), axis=1)

snowball = Porter2Stemmer()

article_master['stemmed_content'] = article_master.apply \
    (lambda row: text_stemmer(row.reduced_content, snowball), axis=1)

article_master['stemmed_content'] = article_master['stemmed_content'].fillna(
    '')

# REDUCE TITLE:
# It must be noted that numbers are removed from the content and not from the title
article_master['reduced_title'] = article_master.apply \
    (lambda row: re.sub(r'[^a-z0-9\s]', '', row.title.lower()), axis=1)

article_master['stemmed_title'] = article_master.apply \
    (lambda row: text_stemmer(row.reduced_title, snowball), axis=1)
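Note: this snippet calls a text_stemmer helper that is not shown. A plausible sketch consistent with how it is invoked (text plus a stemmer instance, returning a stemmed string); the real implementation may differ:

def text_stemmer(text, stemmer):
    # Hypothetical helper: stem each whitespace token and rejoin.
    return ' '.join(stemmer.stem(token) for token in text.split())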
Example #26
0
class Index_name(dbbase):
    __table__ = Table('index_name', md, autoload=True)

    stop_words = get_stopwords()
    stemmer = Porter2Stemmer()

    DATA_N = 163249
    AVG_LEN = 33.782259003117936

    def fetch_from_db(self, term):
        """
        fetch the corresponding index from database
        :param term:
        :param table_name:
        :return:
        """
        row = dbsession.query(Index_name).filter_by(term=term).first()
        return row

    def data_cleanup_tf(self, data):
        """
        clean data and construct tf dictionary
        :param data:
        :return: length of data and tf dictionary
        """
        tf_dict = {}  # {term: tf, ...}
        n = 0  # length of data

        terms = data.lower().split()  # lower the data and split
        for term in terms:
            # filter stop words having quotation marks
            # filter sites in a simple way
            if (term not in self.stop_words) and ('http' not in term) and ('www' not in term):
                term = re.sub(r'[^a-z]', '', term)  # remove non-alphabetic letters
                # filter stop words again and blank term
                if (term not in self.stop_words) and (len(term) != 0):
                    term = self.stemmer.stem(term)  # stemming
                    n += 1
                    if term in tf_dict:
                        tf_dict[term] += 1
                    else:
                        tf_dict[term] = 1
        return n, tf_dict

    def result_by_tfidf(self, query):
        """
        query by tfidf, for only title
        :param query:
        :return:
        """
        n, tf_dict = self.data_cleanup_tf(query)

        tfidf_scores = {}
        for term in tf_dict.keys():
            r = self.fetch_from_db(term)
            if r is None:
                continue
            df = r.df
            idf = math.log(self.DATA_N / df)

            posting_list = r.postings.split('\n')
            for posting in posting_list:
                rid, tf, length = posting.split('\t')
                rid = int(rid)
                tf = int(tf)
                s = (1 + math.log(tf)) * idf * tf_dict[term]
                if rid in tfidf_scores:
                    tfidf_scores[rid] = tfidf_scores[rid] + s
                else:
                    tfidf_scores[rid] = s

        tfidf_scores = sorted(tfidf_scores.items(), key=operator.itemgetter(1))
        tfidf_scores.reverse()

        result = [x[0] for x in tfidf_scores]
        # print(len(tfidf_scores), len(result))
        if len(result) == 0:
            return 0, []
        else:
            return 1, result
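Note: result_by_tfidf scores each matching posting as s = (1 + ln tf_doc) * ln(N / df) * tf_query and sums the contributions per recipe id. A worked one-term example with made-up numbers:

import math

DATA_N = 163249                    # corpus size used above
df, tf_doc, tf_query = 500, 3, 1   # made-up posting statistics
s = (1 + math.log(tf_doc)) * math.log(DATA_N / df) * tf_query
print(round(s, 2))                 # ~12.15 for this posting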
Example #27
0
 def stemming_tokenizer(self, text):
     stemmer = Porter2Stemmer()
     return [stemmer.stem(w) for w in word_tokenize(text)]
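Note: this tokenizer assumes NLTK's word_tokenize is imported (and the punkt tokenizer data downloaded). A hedged usage sketch outside whatever class it belongs to:

from nltk.tokenize import word_tokenize
from porter2stemmer import Porter2Stemmer

stemmer = Porter2Stemmer()
print([stemmer.stem(w) for w in word_tokenize("stemming reduces related words")])
# e.g. ['stem', 'reduc', 'relat', 'word'] -- exact output depends on the stemmer version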