Example #1
from __future__ import division
import nltk
import string
import sys
from terminal_colors import Tcolors
from porter2stemmer import Porter2Stemmer
stemmer = Porter2Stemmer()
sys.path.append(sys.path[0] + "/../")


class PolarityClassifier:
    """
    PolarityClassifier: Rule-based polarity classification of sentences 
    according to the following paper:

    T. Wilson, J. Wiebe, and P. Hoffmann. Recognizing contextual polarity 
    in phrase-level sentiment analysis. In Proceedings of the conference 
    on Human Language Technology and Empirical Methods in Natural Language 
    Processing, HLT '05, pages 347--354, 2005.

    Enhancements: we have incorporated an emoticon and slang dictionary 
    in addition to the MPQA lexicon used in the paper.
    """
    def __init__(self, tagger, lexicon, debug=False):
        self.lexicon = lexicon.words
        self.sentence = None
        self.words = []
        self.feature_words = {}
        self.polar_expressions = []
        self.polar_with_tags = {}
        self.polar_with_score = {}
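
The class above is shown only up to its constructor. As a rough, self-contained sketch of how a stemmer and a prior-polarity lexicon interact in this kind of rule-based classifier (the toy lexicon and the fall-back-to-stem lookup are illustrative assumptions, not the MPQA format the paper uses):

from porter2stemmer import Porter2Stemmer

stemmer = Porter2Stemmer()

# Toy prior-polarity lexicon; the real classifier loads MPQA entries
# plus emoticon and slang dictionaries.
lexicon = {"love": "positive", "hate": "negative"}

def word_polarity(word):
    # Try the surface form first, then fall back to the stemmed form.
    word = word.lower()
    if word in lexicon:
        return lexicon[word]
    return lexicon.get(stemmer.stem(word), "neutral")

print(word_polarity("hated"))  # looked up via its stem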
Example #2
import configparser
import re
import sqlite3

from porter2stemmer import Porter2Stemmer

# Posting (the per-document posting entry) is defined elsewhere in this project.


class IndexModule:
    data_list = None
    data_n = None

    stop_words = set()  # set of stop words
    stemmer = Porter2Stemmer()  # init porter stemmer

    config_path = None
    config = None

    conn = None

    def __init__(self):
        """
        init config, stop words, data list
        """
        self.config_path = 'config.ini'
        self.config = configparser.ConfigParser()
        self.config.read(self.config_path, 'utf-8')

        with open(self.config['DEFAULT']['STOPWORDS_PATH'],
                  encoding='utf-8') as f:
            self.stop_words = set(f.read().split())

        self.conn = sqlite3.connect(self.config['DEFAULT']['SE_DB_PATH'])
        self.conn.row_factory = sqlite3.Row
        c = self.conn.cursor()
        self.data_list = c.execute('select * from recipes').fetchall()
        self.data_n = len(self.data_list)

        # write DATA_N to config
        self.config.set('DEFAULT', 'DATA_N', str(self.data_n))
        with open(self.config_path, 'w', encoding='utf-8') as f:
            self.config.write(f)

    def __del__(self):
        """
        close the database
        :return:
        """
        self.conn.close()

    def write_index_to_db(self, index, table_name):
        """
        write inverted index to db
        index is in the form {term: [df, [posting, ...]], ...}
        :param table_name:
        :param index:
        :return:
        """
        conn = sqlite3.connect(self.config['DEFAULT']['SE_DB_PATH'])
        c = conn.cursor()

        c.execute('drop table if exists %s' % table_name)
        c.execute(
            'create table %s (term text primary key, df integer, postings text)'
            % table_name)

        for key, value in index.items():
            posting_list = '\n'.join(map(str, value[1]))
            values = (key, value[0], posting_list)
            c.execute('insert into %s values (?, ?, ?)' % table_name, values)

        conn.commit()
        conn.close()

    def data_cleanup_tf(self, data):
        """
        clean data and construct tf dictionary
        :param data:
        :return: length of data and tf dictionary
        """
        tf_dict = {}  # {term: tf, ...}
        n = 0  # length of data

        terms = data.lower().split()  # lower the data and split
        for term in terms:
            # first stop-word pass (raw terms may still carry punctuation)
            # and a simple filter for web addresses
            if (term not in self.stop_words) and ('http' not in term) and (
                    'www' not in term):
                term = re.sub(r'[^a-z]', '',
                              term)  # remove non-alphabetic letters
                # filter stop words again and blank term
                if (term not in self.stop_words) and (len(term) != 0):
                    term = self.stemmer.stem(term)  # stemming
                    n += 1
                    if term in tf_dict:
                        tf_dict[term] += 1
                    else:
                        tf_dict[term] = 1
        return n, tf_dict

    def construct_index_name_desc_ing(self):
        """
        construct inverted index over name, description and ingredients
        :return:
        """
        inverted_index = {}  # form: {term: [df, [posting, ...]], ...}
        AVG_LEN = 0  # average document length over name, description and ingredients

        for recipe in self.data_list:
            rid = recipe['id']
            name = recipe['name']
            description = recipe['description']
            ingredients = recipe['ingredients']

            length, term_tf = self.data_cleanup_tf(name + ' ' + description +
                                                   ' ' + ingredients)
            AVG_LEN += length
            for term, tf in term_tf.items():
                posting = Posting(rid, tf, length)
                if term in inverted_index:
                    inverted_index[term][0] += 1  # df++
                    inverted_index[term][1].append(
                        posting)  # add posting to list
                else:
                    inverted_index[term] = [1, [posting]]  # [df, [posting]]

        AVG_LEN /= self.data_n
        # print(len(inverted_index), inverted_index)

        # write AVG_LEN to config
        self.config.set('DEFAULT', 'AVG_LEN', str(AVG_LEN))
        with open(self.config_path, 'w', encoding='utf-8') as f:
            self.config.write(f)

        self.write_index_to_db(inverted_index, 'index_name_desc_ing')

    def construct_index_name(self):
        """
        construct inverted index with name
        :return:
        """
        inverted_index = {}  # form: {term: [df, [posting, ...]], ...}

        for recipe in self.data_list:
            rid = recipe['id']
            name = recipe['name']

            length, term_tf = self.data_cleanup_tf(name)
            for term, tf in term_tf.items():
                posting = Posting(rid, tf, length)
                if term in inverted_index:
                    inverted_index[term][0] += 1  # df++
                    inverted_index[term][1].append(
                        posting)  # add posting to list
                else:
                    inverted_index[term] = [1, [posting]]  # [df, [posting]]
        # print(len(inverted_index), inverted_index)

        self.write_index_to_db(inverted_index, 'index_name')

    def construct_index_ingredient(self):
        """
        construct inverted index with ingredient
        :return:
        """
        inverted_index = {}  # form: {term: [df, [posting, ...]], ...}

        for recipe in self.data_list:
            rid = recipe['id']
            ing = recipe['ingredients']

            length, term_tf = self.data_cleanup_tf(ing)
            for term, tf in term_tf.items():
                posting = Posting(rid, tf, length)
                if term in inverted_index:
                    inverted_index[term][0] += 1  # df++
                    inverted_index[term][1].append(
                        posting)  # add posting to list
                else:
                    inverted_index[term] = [1, [posting]]  # [df, [posting]]

        self.write_index_to_db(inverted_index, 'index_ingredient')
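
write_index_to_db serialises each posting with str() and stores one term per row. A minimal sketch of the {term: [df, [posting, ...]]} structure it expects, using a stand-in Posting class (the real Posting lives elsewhere in this project, so its fields and string format here are assumptions):

# Minimal stand-in for the project's Posting class (assumed fields).
class Posting:
    def __init__(self, rid, tf, length):
        self.rid, self.tf, self.length = rid, tf, length

    def __str__(self):
        return '%d\t%d\t%d' % (self.rid, self.tf, self.length)

# Inverted index in the form {term: [df, [posting, ...]], ...}
inverted_index = {}
for rid, doc_len, term_tf in [(1, 3, {'chicken': 2}), (2, 5, {'chicken': 1, 'rice': 1})]:
    for term, tf in term_tf.items():
        posting = Posting(rid, tf, doc_len)
        if term in inverted_index:
            inverted_index[term][0] += 1
            inverted_index[term][1].append(posting)
        else:
            inverted_index[term] = [1, [posting]]

# 'chicken' appears in two documents, so its df is 2
print(inverted_index['chicken'][0])
print('\n'.join(map(str, inverted_index['chicken'][1])))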
Example #3
import configparser
import re
import sqlite3

from porter2stemmer import Porter2Stemmer
from scipy.sparse import dok_matrix
from sklearn.neighbors import NearestNeighbors


class RecommendationModule:
    data_list = None
    data_n = None
    cleaned_data_list = []

    vocab = set()

    stop_words = set()  # set of stop words
    stemmer = Porter2Stemmer()  # init porter stemmer

    config_path = None
    config = None

    conn = None

    def __init__(self):
        """
        init config, stop words, data list, database
        """
        self.config_path = 'config.ini'
        self.config = configparser.ConfigParser()
        self.config.read(self.config_path, 'utf-8')

        with open(self.config['DEFAULT']['STOPWORDS_PATH'],
                  encoding='utf-8') as f:
            self.stop_words = set(f.read().split())

        self.conn = sqlite3.connect(self.config['DEFAULT']['SE_DB_PATH'])
        self.conn.row_factory = sqlite3.Row
        c = self.conn.cursor()
        self.data_list = c.execute('select * from recipes').fetchall()
        self.data_n = len(self.data_list)

        c.execute('drop table if exists k_nearest')
        c.execute(
            'create table k_nearest (id integer primary key, '
            'nn1 integer, nn2 integer, nn3 integer, nn4 integer, nn5 integer)')
        self.conn.commit()

    def __del__(self):
        """
        close the database
        :return:
        """
        self.conn.close()

    def get_list_maxnum_index(self, num_list, top):
        """
        get the indices of the top largest numbers in the list (largest first)
        :param num_list:
        :param top:
        :return:
        """
        num_dict = {}
        for i in range(len(num_list)):
            num_dict[i] = num_list[i]
        res_list = sorted(num_dict.items(), key=lambda e: e[1])
        max_num_index = [one[0] for one in res_list[::-1][:top]]

        return list(max_num_index)

    def data_cleanup_tf(self, data):
        """
        clean data and construct tf dictionary
        :param data:
        :return: length of data and tf dictionary
        """
        tf_dict = {}  # {term: tf, ...}
        n = 0  # length of data

        terms = data.lower().split()  # lower the data and split
        for term in terms:
            # first stop-word pass (raw terms may still carry punctuation)
            # and a simple filter for web addresses
            if (term not in self.stop_words) and ('http' not in term) and (
                    'www' not in term):
                term = re.sub(r'[^a-z]', '',
                              term)  # remove non-alphabetic letters
                # filter stop words again and blank term
                if (term not in self.stop_words) and (len(term) != 0):
                    term = self.stemmer.stem(term)  # stemming
                    n += 1
                    if term in tf_dict:
                        tf_dict[term] += 1
                    else:
                        tf_dict[term] = 1
        return n, tf_dict

    def construct_data_vocab(self):
        """
        construct vocabulary from recipe names only
        :return:
        """
        for recipe in self.data_list:
            name = recipe['name']
            # ingredients = recipe['ingredients']

            term_tf = self.data_cleanup_tf(name)[1]

            for term in term_tf.keys():
                self.vocab.add(term)

            self.cleaned_data_list.append(list(term_tf.keys()))

    def write_row_to_db(self, rid_self, rid_list):
        """
        write each row into database
        :param rid_self:
        :param rid_list:
        :return:
        """
        c = self.conn.cursor()

        values = (rid_self, rid_list[0], rid_list[1], rid_list[2], rid_list[3],
                  rid_list[4])
        c.execute('insert into k_nearest values (?, ?, ?, ?, ?, ?)', values)

        self.conn.commit()

    def construct_k_nearest(self):
        """
        construct the k nearest rid
        :return:
        """
        word2id = {}
        for word_id, word in enumerate(self.vocab):
            word2id[word] = word_id

        row2rid = {}  # convert the row id to the recipe id

        matrix_size = (self.data_n, len(word2id))
        X = dok_matrix(matrix_size)

        for i, recipe in enumerate(self.data_list):
            rid = recipe['id']
            name = recipe['name']

            row2rid[i] = rid

            term_tf = self.data_cleanup_tf(name)[1]
            for term, tf in term_tf.items():
                X[i, word2id[term]] = tf

        knn = NearestNeighbors(n_neighbors=6).fit(X)
        for row, x in enumerate(self.cleaned_data_list):
            x_in = dok_matrix((1, len(word2id)))
            for term in x:
                x_in[0, word2id[term]] += 1
                # print(word2id[term])
                # print(x_in)

            neighbours = knn.kneighbors(x_in, 6, return_distance=False)[0]
            rid_self = row2rid[row]
            rid_list = list(
                set([row2rid[row] for row in neighbours]) - set([rid_self]))
            # print(neighbours)
            # print([row2rid[row] for row in neighbours])
            # print(rid_self)
            # print(rid_list)
            self.write_row_to_db(rid_self, rid_list)

        # dictionary = corpora.Dictionary(cleaned_data_list)  # generate the dictionary
        # corpus = [dictionary.doc2bow(item) for item in cleaned_data_list]
        # tfidf = models.TfidfModel(corpus)
        # num_features = len(dictionary.token2id.keys())  # number of terms in the dictionary
        # index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=num_features)

        # for i, data in enumerate(cleaned_data_list):
        #     vector = dictionary.doc2bow(data)  # convert to svm
        #     sims = index[tfidf[vector]]
        #     row_list = self.get_list_maxnum_index(list(sims), 6)
        #     rid_list = [row2rid[row] for row in row_list]
        #     self.write_row_to_db(row2rid[i], rid_list.remove(row2rid[i])[:5])

    def find_k_nearest(self):
        """
        find the k nearest rid and write to database
        :return:
        """
        self.construct_data_vocab()
        self.construct_k_nearest()
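
construct_k_nearest reduces to a bag-of-words dok_matrix fed to scikit-learn's NearestNeighbors. A compact, self-contained sketch of that core step on toy documents (not the recipe data):

from scipy.sparse import dok_matrix
from sklearn.neighbors import NearestNeighbors

docs = [['tomato', 'soup'], ['tomato', 'pasta'], ['chocolate', 'cake']]
vocab = sorted({term for doc in docs for term in doc})
word2id = {word: i for i, word in enumerate(vocab)}

# Rows are documents, columns are terms, values are term frequencies.
X = dok_matrix((len(docs), len(vocab)))
for row, doc in enumerate(docs):
    for term in doc:
        X[row, word2id[term]] += 1

knn = NearestNeighbors(n_neighbors=2).fit(X)

# Query with the first document; the nearest hit is usually the document
# itself, which the class above removes via a set difference on recipe ids.
query = dok_matrix((1, len(vocab)))
for term in docs[0]:
    query[0, word2id[term]] += 1
print(knn.kneighbors(query, 2, return_distance=False)[0])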
Example #4
class crToPG(object):

    stemmer = Porter2Stemmer()

    #remove stop words
    def remove_stop_words(self, frequency_list):

        stop_words = []
        #stop_words = get_stop_words('en')
        with open("stop_words_list.txt") as stop_words_list:
            #stop_words = [line.split(',') for line in stop_words_list if line.strip()]
            stop_words_data = stop_words_list.read()
            stop_words = stop_words_data.split(',')
        #frequency_list=frequency_list.split(" ")

        temp_list = []
        stemmer = Porter2Stemmer()
        #print(frequency_list)
        #stop_words.append(stop_words_data)
        #print(stop_words)
        try:
            '''for key in frequency_list:
				#print(key)
        			if key.lower() not in stop_words:
					
    							#print (key)
							
							#key = self.remove_custom_stop_words(key)
							temp_list.append([key])
							
				else:
					print("Removing:"+str(key))'''
            #frequency_list = ' '.join(e for e in frequency_list if e.isalnum())
            for c in string.punctuation:
                frequency_list = frequency_list.replace(c, "")
            #print(frequency_list)
            querywords = frequency_list.split()
            #querywords = ' '.join(e for e in frequency_list if e.isalnum())
            #print(querywords)

            #for stemwords in querywords:
            #print(stemwords,stemmer.stem(stemwords))

            resultwords = [
                stemmer.stem(word) for word in querywords
                if word.lower() not in stop_words
            ]

            result = ' '.join(resultwords)

            #print result

        except ValueError:
            print "no such value"

        #print(result)
        return result

    def ngrams(self, inputtext, n):
        inputtext = inputtext.split()
        output = {}
        for i in range(len(inputtext) - n + 1):
            #print(inputtext[i:i+n])
            g = ' \n'.join(inputtext[i:i + n])
            output.setdefault(g, 0)
            output[g] += 1
        return output

    def remove_custom_stop_words(self, word_list):

        #stop_words_lst = ['yo', 'so', 'well', 'um', 'a', 'the', 'you know', 'i mean']
        with open("stop_words_list.txt") as stop_words_list:
            stop_words_list = [
                line.split('\n') for line in stop_words_list if line.strip()
            ]
            '''for word in stop_words_list:

    			pattern = r'\b'+word[0]+r'\b'

    			word_list = re.sub(pattern, '', word_list)'''

        try:
            if word_list not in stop_words_list:
                return word_list
            else:
                print("Match found:" + str(word_list))
        except ValueError:
            print "no such value"

    def ingest(self, crfile, pagestack, billstack, speechstack, speechstack1):
        """
        Break a crdoc into three parts
        Pass the appropriate rows for each part
        to the right stack for a bulk insert.
        """
        #print(crfile)
        page_row = OrderedDict([('pageid', crfile['id']),
                                ('title', rd(crfile['doc_title'])),
                                ('chamber', crfile['header']['chamber']),
                                ('extension', crfile['header']['extension']),
                                ('cr_day', crfile['header']['day']),
                                ('cr_month', crfile['header']['month']),
                                ('cr_year', crfile['header']['year']),
                                ('num', crfile['header']['num']),
                                ('vol', crfile['header']['vol']),
                                ('wkday', crfile['header']['wkday'])])
        # Add the "page" level to the page stack first
        pagestack.add(page_row)

        bills = []
        if 'related_bills' in list(crfile.keys()):
            for bill in crfile['related_bills']:
                bill_row = OrderedDict([('congress', bill['congress']),
                                        ('context', bill['context']),
                                        ('bill_type', bill['type']),
                                        ('bill_no', bill['number']),
                                        ('pageid', crfile['id'])])
                bills.append(bill_row)

        # Bills for the bill god!
        billstack.add(bills)

        #speeches = []
        ''' for speech in crfile['content']:
            if speech['kind'] == 'speech':
                speechid = crfile['id'] + '-' + str(speech['turn'])
		test = 'test string'
                speech_row = OrderedDict([('speechid',speechid),
                              ('speaker',speech['speaker']),
                              ('speaker_bioguide',speech['speaker_bioguide']),
                              ('pageid',crfile['id']),
                              ('text',rd(speech['text'])),
                              ('turn',speech['turn']),
			      ('party',test)
                             ]) # Gotta get rid of delimiter char
                speeches.append(speech_row)'''
        speeches_republican = []
        speeches_democratic = []
        #speech_row_D =[]
        #speech_row_R =[]
        democratic_data_output = ''
        republican_data_output = ''

        for speech in crfile['content']:

            if speech['kind'] == 'speech':
                #speechid = crfile['id'] + '-' + str(speech['turn'])
                #test = 'anannya'
                #print(speech)

                #print(speech['speaker_bioguide'])
                #print(rd(speech['text']))

                v = str(speech['speaker_bioguide']) + "||" + str(
                    rd(speech['text']))
                '''with open('speeches_test','a+') as out_json:
                    json.dump(v,out_json)'''
                if speech['speaker_bioguide']:
                    keybioguideid = speech['speaker_bioguide']
                    outpath = os.path.join('', 'json', keybioguideid + '.json')
                    #print(outpath)
                    #outpath = 'json\\'+keybioguideid+'.json'
                    with open(outpath) as json_data:
                        d = json.load(json_data)
                    #print(rd(speech['text']))
                    #print('*****************************************************************************************************************************************')

                    #print(speech_remove_sort_words)
                    if d['party'] == 'D':
                        speech_row_D = []

                        speech_remove_stop_words = []
                        speech_remove_stop_words = self.remove_stop_words(
                            rd(speech['text']))
                        #print(d['party'])
                        #print(speech_remove_stop_words)

                        current_speaker_data = '\n' + speech_remove_stop_words
                        democratic_data_output = democratic_data_output + current_speaker_data + '\n'
                        #print(democratic_data_output)
                        '''with open('democratic_speeches.txt','a+') as out_json:
                    			out_json.write(democratic_data_output)'''

                        speech_row_D = OrderedDict([
                            #('speechid',speechid),
                            ('affiliation', 'Affiliation:' + d['party']),
                            ('speaker', speech['speaker']),
                            #('speaker_bioguide',speech['speaker_bioguide']),
                            #('pageid',crfile['id']),
                            ('text', speech_remove_stop_words),
                            #('turn',speech['turn'])
                        ])
                        speeches_democratic.append(speech_row_D)
                        '''if len(speech_remove_stop_words):
					#print(speech_remove_stop_words)
					speeches_democratic.append(speech_row_D)
				else:
					pass
				#print(str(keybioguideid) + "D")'''

                    elif d['party'] == 'R':
                        speech_row_D = []
                        speech_remove_stop_words = []
                        speech_remove_stop_words = self.remove_stop_words(
                            rd(speech['text']))

                        #print(speech_remove_stop_words,stemmer.stem(speech_remove_stop_words))

                        current_speaker_data = '\n' + speech_remove_stop_words
                        republican_data_output = republican_data_output + current_speaker_data + '\n'
                        '''with open('republican_speeches.txt','a+') as out_json:
                    			out_json.write(republican_data_output)'''
                        speech_row_R = OrderedDict([
                            #('speechid',speechid),
                            ('affiliation', 'Affiliation:' + d['party']),
                            ('speaker', speech['speaker']),
                            #('speaker_bioguide',speech['speaker_bioguide']),
                            #('pageid',crfile['id']),
                            #('text',''),
                            ('text', speech_remove_stop_words),
                            #('turn',speech['turn'])
                        ])
                        speeches_republican.append(speech_row_R)
                        #print(str(keybioguideid) + "R")
                        '''if len(speech_remove_stop_words):
					#print(speech_remove_stop_words)
			      		speeches_republican.append(speech_row_R)
			      else:
					pass'''

                else:
                    keybioguideid = 'dummy'
                    #print(str(keybioguideid))

                #pr.find_people(pr(),'','')

                # SPEECHES FOR THE SPEECH THRONE
                #print(speeches_republican)
                #print(speeches_democratic)
                #print(democratic_data_output)

                speechstack.add(speeches_republican)
                speechstack1.add(speeches_democratic)

#print(democratic_data_output)
        '''
	line = ""
	open_file = democratic_data_output
	for val in open_file:
    		line += val.lower()
	tokens = line.split()

	bigram_measures = nltk.collocations.BigramAssocMeasures()
	finder = BigramCollocationFinder.from_words(tokens)
	finder.apply_freq_filter(1)
	a = finder.ngram_fd.viewitems()
	democratic_speeches_2_grams=''
	for i, j in a:
  		#print("{0} {1} {2}".format(i[0], i[1], j))
		democratic_speeches_2_grams = str("{0} {1} {2}".format(i[0], i[1],': '+ str(j))) + '\n                                     '
		
		with open('democratic_speeches_2_grams.txt','a+') as out_json:
		    #print(democratic_speeches_2_grams)
                    out_json.write(str(democratic_speeches_2_grams) + '\n')'''
        '''democratic_speeches_2_grams = str(self.ngrams(democratic_data_output,2)) + "\n"
	#print(self.ngrams(democratic_data_output,2))
	with open('democratic_speeches_2_grams.txt','a+') as out_json:
                    out_json.write(democratic_speeches_2_grams)'''

        #republican_speeches_2_grams = str(self.ngrams(republican_data_output,2)) + "\n"
        '''
	repline = ""
	open_file_rep = republican_data_output
	for val in open_file_rep:
    		repline += val.lower()
	tokens = repline.split()

	bigram_measures = nltk.collocations.BigramAssocMeasures()
	finder = BigramCollocationFinder.from_words(tokens)
	finder.apply_freq_filter(1)
	a = finder.ngram_fd.viewitems()
	republican_speeches_2_grams = ''
	for i, j in a:
  		#print("{0} {1} {2}".format(i[0], i[1], j))
		republican_speeches_2_grams = str("{0} {1} {2}".format(i[0], i[1], ': '+ str(j))) + '\n                                     '

	with open('republican_speeches_2_grams.txt','a+') as out_json:
		    #print(republican_speeches_2_grams)
                    out_json.write(str(republican_speeches_2_grams) + '\n')'''
    def find_people(self):
        mbrs = self.doc_ref.find_all('congmember')
        if mbrs:
            for mbr in mbrs:
                self.speakers[mbr.find('name',
                                     {'type':'parsed'}).string] = \
                                     self.people_helper(mbr)

    '''def people_helper(self,tagobject):
        output_dict = {}
        if 'bioguideid' in tagobject.attrs:
            output_dict['bioguideid'] = tagobject['bioguideid']
        elif 'bioGuideId' in tagobject.attrs:
            output_dict['bioguideid'] = tagobject['bioGuideId']
        else:
            output_dict['bioguideid'] = 'None'
        for key in ['chamber','congress','party','state','role']:
            if key in tagobject.attrs:
                output_dict[key] = tagobject[key]
            else:
                output_dict[key] = 'None'
        try:
            output_dict['name_full'] = tagobject.find('name',{'type':'authority-fnf'}).string
        except:
            output_dict['name_full'] = 'None'
	#print(output_dict)
        return output_dict

    # Flow control for metadata generation
    def gen_file_metadata(self):
        # Sometimes the searchtitle has semicolons in it so .split(';') is a nogo
        temp_ref = self.cr_dir.mods.find('accessid', text=self.access_path)
        if temp_ref is None:
            raise RuntimeError("{} doesn't have accessid tag".format(self.access_path))
        self.doc_ref = temp_ref.parent
        matchobj = re.match(self.re_vol, self.doc_ref.searchtitle.string)
        if matchobj:
            self.doc_title, self.cr_vol, self.cr_num = matchobj.group('title','vol','num')
        else:
            logging.warn('{0} yields no title, vol, num'.format(
                self.access_path))
            self.doc_title, self.cr_vol, self.cr_num = \
              'None','Unknown','Unknown'
        self.find_people()
        self.find_related_bills()
        self.find_related_laws()
        self.find_related_usc()
        self.find_related_statute()
        self.date_from_entry()
        self.chamber = self.doc_ref.granuleclass.string
        self.re_newspeaker = self.make_re_newspeaker()
        self.item_types['speech']['patterns'] = [self.re_newspeaker]'''

    def __init__(self, start, **kwargs):
        """
        BE SURE TO INCLUDE do_mode='yield' in kwargs!
        This object handles flow control for new data
        entering a Postgres database using congressionalrecord2's
        data model.

        It breaks the incoming Python dictionaries into three stacks
        of rows, one for each table in this data model.

        It writes the results to each of three flatfiles suitable for
        a bulk update through COPY.

        This is the way to minimize the number
        of transactions to the database, which we want.
        """
        kwargs['do_mode'] = 'yield'
        if 'csvpath' in kwargs:
            pass
        else:
            kwargs['csvpath'] = 'dbfiles'
        pagepath, billpath, speechpath, speechpath1 = [
            os.path.join(kwargs['csvpath'], filename) for filename in
            ['pages.csv', 'bills.csv', 'speeches_R.csv', 'speeches_D.csv']
        ]
        self.downloader = dl(start, **kwargs)
        self.doc_ref = ''
        memberlistfinal = []
        #object1 = congressionalrecord.fdsys.cr_parser.ParseCRDir()
        #print(object1)
        #self.cr_dir = '<congressionalrecord.fdsys.cr_parser.ParseCRDir object at 0x7f0c7c88cb90>'
        #self.cr_dir=cr_dir
        #self.gen_file_metadata()
        #print(pr.find_people(pr(self,'')))
        #self.find_people()
        #print('anannya'+str(pr.memberlist))
        #print(pr('/home/anannyadas/Desktop/congress/congressional-record-master/congressionalrecord/pg_run/fdsys'))
        self.page_fields = [
            'pageid', 'title', 'chamber', 'extension', 'cr_day', 'cr_month',
            'cr_year', 'num', 'vol', 'pages', 'wkday'
        ]
        self.bill_fields = [
            'congress', 'context', 'bill_type', 'bill_no', 'pageid'
        ]
        #self.speech_fields = ['speechid','affiliation','speaker','speaker_bioguide','pageid','text','turn']
        self.speech_fields = ['affiliation', 'speaker', 'text']
        pagestack = crPages(pagepath, self.page_fields)
        billstack = crBills(billpath, self.bill_fields)
        speechstack = crSpeeches(speechpath, self.speech_fields)
        speechstack1 = crSpeeches(speechpath1, self.speech_fields)
        for crfile in self.downloader.yielded:
            #print(crfile)
            doc = crfile.crdoc
            self.ingest(doc, pagestack, billstack, speechstack, speechstack1)
            # pagestack.write()
            # billstack.write()
            speechstack.write()
            speechstack1.write()
        '''with open('democratic_speeches.txt','r+') as json_data:
    		open_file = json.load(json_data)
		print(open_file)'''
        '''open_file = open('democratic_speeches.txt','r+').read() 
	#print(open_file)
	line = ""

	for val in open_file:
    		line += val.lower()
	tokens = line.split()

	bigram_measures = nltk.collocations.BigramAssocMeasures()
	finder = BigramCollocationFinder.from_words(tokens)
	finder.apply_freq_filter(1)
	a = finder.ngram_fd.viewitems()
	democratic_speeches_2_grams=''
	print(sorted(a, key=lambda x: x[1]))
	#print(a)
	for i, j in a:
  		#print("{0} {1} {2}".format(i[0], i[1], j))
		democratic_speeches_2_grams = str("{0} {1} {2}".format(i[0], i[1],': '+ str(j))) + '\n                                     '
		#democratic_speeches_2_grams = "{0} {1} {2}".format(i[0], i[1],j)
		#democratic_speeches_2_grams = democratic_speeches_2_grams.update({str(i[0])+" "+str(i[1]),j}) 
		
		#sorted(democratic_speeches_2_grams, key=lambda x: x[1])
		#print(democratic_speeches_2_grams)
		#democratic_speeches_2_grams_set = set()

		#for details in democratic_speeches_2_grams:
     			#democratic_speeches_2_grams_set.add(details)
		with open('democratic_speeches_2_grams.txt','a+') as out_json:
		    #print(democratic_speeches_2_grams_set)
                    out_json.write(str(democratic_speeches_2_grams))'''
        '''
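
The heart of the ingest step above is remove_stop_words: strip punctuation, drop stop words, stem the rest. A minimal stand-alone sketch of that pipeline (the inline stop-word set replaces the stop_words_list.txt file the class reads from disk):

import string

from porter2stemmer import Porter2Stemmer

stemmer = Porter2Stemmer()
# Inline stop words for illustration; the real code reads a
# comma-separated stop_words_list.txt.
stop_words = {'the', 'a', 'of', 'and', 'to'}

def clean_speech(text):
    """Strip punctuation, drop stop words, and stem what remains."""
    for c in string.punctuation:
        text = text.replace(c, '')
    return ' '.join(stemmer.stem(word) for word in text.split()
                    if word.lower() not in stop_words)

print(clean_speech('The Speaker of the House, rising to debate...'))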
Example #5
def test_index_txt_file():
    txt_index = positional_inverted_index()
    stemmer = Porter2Stemmer()
    k = kgram_index()

    file_names = []
    documentID = 1

    # Dealing with punctuation
    p = dict.fromkeys(string.punctuation)
    p.pop('-') # we need to deal with hyphens
    punctuation = str.maketrans(p)

    directory = path.dirname(path.realpath(__file__)) + '/unit_test_docs/'
    chdir(directory)

    for file in listdir(directory):
        if file.endswith('.txt'):
            file_names.append(str(file))

    for file in file_names:
        try:
            with open(file) as txt_file:

                content = txt_file.readlines()
                content = content[0].lower().translate(punctuation).split(' ')
                content = list(filter(lambda w: w != '', map(lambda s: s.strip(), content)))

                positions_dict = {}
                for i in range(0, len(content)):
                    if '-' in content[i]:

                        hyphened_word_parts = content[i].split('-')
                        hyphened_word = content[i].replace('-', '')
                        hyphened_word_parts.append(hyphened_word)

                        for word in hyphened_word_parts:
                            if word in positions_dict:
                                positions_dict[word].append(i)
                            else:
                                positions_dict[word] = [i]
                    else:

                        if content[i] in positions_dict:
                            positions_dict[content[i]].append(i)
                        else:
                            positions_dict[content[i]] = [i]


                for key in positions_dict:
                    txt_index.add_term(stemmer.stem(key), documentID, positions_dict[key])
        except FileNotFoundError as e:
            i = 0
            print(e)

        documentID = documentID + 1

    for key in txt_index.get_index():
        txt_index.print_term_info(key)

    correct_map = {}
    correct_map['today'] = [posting(1, [0]), posting(2, [0]), posting(3, [0])]
    correct_map['i'] = [posting(1, [1, 6, 11]), posting(2, [1]), posting(3, [1]), posting(4, [0])]
    correct_map['fell'] = [posting(1, [2])]
    correct_map['in'] = [posting(1, [3])]
    correct_map['a'] = [posting(1, [4])]
    correct_map['well'] = [posting(1, [5])]
    correct_map['have'] = [posting(1, [7]), posting(4, [1])]
    correct_map['no'] = [posting(1, [8]), posting(5, [8])]
    correct_map['mouth'] = [posting(1, [9])]
    correct_map['but'] = [posting(1, [10])]
    correct_map['want'] = [posting(1, [12])]
    correct_map['to'] = [posting(1, [13])]
    correct_map['scream'] = [posting(1, [14])]
    correct_map['top'] = [posting(2, [2])]
    correct_map['deck'] = [posting(2, [3])]
    correct_map['lethal'] = [posting(2, [4])]
    correct_map['yogg'] = [posting(2, [5])]
    correct_map['saron'] = [posting(2, [5])]
    correct_map['yoggsaron'] = [posting(2, [5])]
    correct_map['f**k'] = [posting(2, [6])]
    correct_map['me'] = [posting(2, [7]), posting(4, [8])]
    correct_map['over'] = [posting(2, [8])]
    correct_map['super'] = [posting(2, [9])]
    correct_map['hard'] = [posting(2, [10])]
    correct_map['learn'] = [posting(3, [2])]
    correct_map['the'] = [posting(3, [3]), posting(5, [2])]
    correct_map['mean'] = [posting(3, [4])]
    correct_map['of'] = [posting(3, [5])]
    correct_map['pain'] = [posting(3, [6])]
    correct_map['it'] = [posting(3, [7]), posting(4, [9]), posting(5, [12])]
    correct_map['was'] = [posting(3, [8]), posting(4, [10])]
    correct_map['all'] = [posting(3, [9])]
    correct_map['caus'] = [posting(3, [10])]
    correct_map['by'] = [posting(3, [11])]
    correct_map['nealdt'] = [posting(3, [12])]
    correct_map['ascend'] = [posting(4, [2])]
    correct_map['into'] = [posting(4, [3])]
    correct_map['enlighten'] = [posting(4, [4])]
    correct_map['my'] = [posting(4, [5])]
    correct_map['waifu'] = [posting(4, [6])]
    correct_map['told'] = [posting(4, [7])]
    correct_map['actual'] = [posting(4, [11])]
    correct_map['okay'] = [posting(4, [12])]
    correct_map['jesus'] = [posting(5, [0])]
    correct_map['take'] = [posting(5, [1])]
    correct_map['wheel'] = [posting(5, [3])]
    correct_map['or'] = [posting(5, [4])]
    correct_map['els'] = [posting(5, [5])]
    correct_map['asian'] = [posting(5, [6])]
    correct_map['driver'] = [posting(5, [7])]
    correct_map['survivor'] = [posting(5, [9])]
    correct_map['dont'] = [posting(5, [10])]
    correct_map['let'] = [posting(5, [11])]
    correct_map['happen'] = [posting(5, [13])]

    for keys in txt_index.get_index():
        assert keys in correct_map
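
The hyphen handling this test exercises indexes each part of a hyphenated token plus the joined form, all at the same position. A small sketch of that logic on toy input:

content = ['yogg-saron', 'is', 'lethal']
positions = {}
for i, token in enumerate(content):
    variants = [token]
    if '-' in token:
        # index each part and the joined form at the same position
        variants = token.split('-') + [token.replace('-', '')]
    for word in variants:
        positions.setdefault(word, []).append(i)

print(positions)
# {'yogg': [0], 'saron': [0], 'yoggsaron': [0], 'is': [1], 'lethal': [2]}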
Example #6
class ParseCRFile(object):
    # Some regex
    re_time = r'^CREC-(?P<year>[0-9]{4})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})-.*'
    re_vol = r'^(?P<title>.*); Congressional Record Vol. (?P<vol>[0-9]+), No. (?P<num>[0-9]+)$'
    re_vol_file =   r'^\[Congressional Record Volume (?P<vol>[0-9]+), Number (?P<num>[0-9]+)'\
                    + r' \((?P<wkday>[A-Za-z]+), (?P<month>[A-Za-z]+) (?P<day>[0-9]+), (?P<year>[0-9]{4})\)\]'
    re_chamber =  r'\[(?P<chamber>[A-Za-z\s]+)\]'
    re_pages =  r'\[Page[s]? (?P<pages>[\w\-]+)\]'
    re_trail = r'From the Congressional Record Online'\
      + r' through the Government (Publishing|Printing) Office \[www.gpo.gov\]$'
    re_rollcall =       r'\[Roll(call)?( Vote)? No. \d+.*\]'
    re_recorderstart =  (r'^\s+(?P<start>'
                        + r'(The (assistant )?legislative clerk read as follows)'
                        + r'|(The nomination considered and confirmed is as follows)'
                        + r'|(The (assistant )?legislative clerk)'
                        + r'|(The nomination was confirmed)'
                        + r'|(There being no objection, )'
                        + r'|(The resolution .*?was agreed to.)'
                        + r'|(The preamble was agreed to.)'
                        + r'|(The resolution .*?reads as follows)'
                        + r'|(The assistant editor .*?proceeded to call the roll)'
                        + r'|(The bill clerk proceeded to call the roll.)'
                        + r'|(The bill clerk called the roll.)'
                        + r'|(The motion was agreed to.)'
                        #+ r'|(The Clerk read the resolution, as follows:)'
                        + r'|(The Clerk read (the resolution, )as follows:)'
                        + r'|(The resolution(, with its preamble,)? reads as follows:)'
                        + r'|(The amend(ment|ed).*?(is)? as follows:)'
                        + r'|(Amendment No\. \d+.*?is as follows:)'
                        + r'|(The yeas and nays resulted.*?, as follows:)'
                        + r'|(The yeas and nays were ordered)'
                        + r'|(The result was announced.*?, as follows:)'
                        + r'|(The .*?editor of the Daily Digest)'
                        + r'|(The (assistant )?bill clerk read as follows:)'
                        + r'|(The .*?read as follows:)'
                        + r'|(The text of the.*?is as follows)'
                        + r'|(amended( to read)? as follows:)'
                        + r'|(The material (previously )?referred to (by.*?)?is as follows:)'
                        + r'|(There was no objection)'
                        + r'|(The amendment.*?was agreed to)'
                        + r'|(The motion to table was .*)'
                        + r'|(The question was taken(;|.))'
                        + r'|(The following bills and joint resolutions were introduced.*)'
                        + r'|(The vote was taken by electronic device)'
                        + r'|(A recorded vote was ordered)'
                        #+ r'|()'
                        + r').*')
    # anchored at the end of the line
    re_recorderend =    (r'('
                        + r'(read as follows:)'
                        + r'|(the Record, as follows:)'
                        + r'|(ordered to lie on the table; as follows:)'
                        + r'|(resolutions as follows:)'
                        + r')$')
    # sometimes the recorder says something that is not unique to them but
    # which, in the right context, we take to indicate a recorder comment.
    re_recorder_fuzzy = (r'^\s+(?P<start>'
                        + r'(Pending:)'
                        + r'|(By M(r|s|rs)\. .* \(for .*)'
                        #+ r'|()'
                        + r').*')
    # NCJ's broader version below, tested on one day of the record.
    # works, honest
    re_recorder_ncj = (r'^\s+(?P<start>'
                       + r'(Pending:)'
                       + r'|(By M(r|rs|s|iss)[\.]? [a-zA-Z]+))'
                       )
    re_clerk = r'^\s+(?P<start>The Clerk (read|designated))'
    re_allcaps = r'^ \s*(?!([_=]+|-{3,}))(?P<title>([A-Z]+[^a-z]+))$'
    re_linebreak = r'\s+([_=]+|-{5,})(NOTE|END NOTE)?([_=]+|-{5,})*\s*'
    re_excerpt = r'\s+(_{3,4})'
    re_newpage =   r'\s*\[\[Page \w+\]\]'
    re_timestamp = r'\s+\{time\}\s+\d{4}'

    
    # Metadata-making functions
    def title_id(self):
        id_num = self.num_titles
        self.num_titles += 1
        return id_num
        
    def make_re_newspeaker(self):
        speaker_list = '|'.join([mbr for mbr in list(self.speakers.keys()) \
        if self.speakers[mbr]['role'] == 'SPEAKING'])
        if len(speaker_list) > 0:
            re_speakers = r'^(\s{1,2}|<bullet>)(?P<name>((' + speaker_list + ')|(((Mr)|(Ms)|(Mrs)|(Miss))\. (([-A-Z\'])(\s)?)+( of [A-Z][a-z]+)?)|(((The ((VICE|ACTING|Acting) )?(PRESIDENT|SPEAKER|CHAIR(MAN)?)( pro tempore)?)|(The PRESIDING OFFICER)|(The CLERK)|(The CHIEF JUSTICE)|(The VICE PRESIDENT)|(Mr\. Counsel [A-Z]+))( \([A-Za-z.\- ]+\))?)))\.'
        else:
            re_speakers = r'^(\s{1,2}|<bullet>)(?P<name>((((Mr)|(Ms)|(Mrs)|(Miss))\. (([-A-Z\'])(\s)?)+( of [A-Z][a-z]+)?)|((The ((VICE|ACTING|Acting) )?(PRESIDENT|SPEAKER|CHAIR(MAN)?)( pro tempore)?)|(The PRESIDING OFFICER)|(The CLERK)|(The CHIEF JUSTICE)|(The VICE PRESIDENT)|(Mr\. Counsel [A-Z]+))( \([A-Za-z.\- ]+\))?))\.'
        return re_speakers
    
    def people_helper(self,tagobject):
        output_dict = {}
        if 'bioguideid' in tagobject.attrs:
            output_dict['bioguideid'] = tagobject['bioguideid']
        elif 'bioGuideId' in tagobject.attrs:
            output_dict['bioguideid'] = tagobject['bioGuideId']
        else:
            output_dict['bioguideid'] = 'None'
        for key in ['chamber','congress','party','state','role']:
            if key in tagobject.attrs:
                output_dict[key] = tagobject[key]
            else:
                output_dict[key] = 'None'
        try:
            output_dict['name_full'] = tagobject.find('name',{'type':'authority-fnf'}).string
        except:
            output_dict['name_full'] = 'None'
        #print(output_dict)
        #cr.memberlistfinal.append(output_dict)

        ''' if 'json' not in os.listdir(outpath):
                    os.mkdir(os.path.join(outpath,'json'))'''
        with open('json/' + output_dict['bioguideid'] + '.json', 'w+') as out_json:
            json.dump(output_dict, out_json)
        return output_dict
    

        
    def find_people(self):
        mbrs = self.doc_ref.find_all('congmember')
        memberlist = mbrs
        #print(memberlist)
        if mbrs:
            for mbr in mbrs:
                self.speakers[mbr.find('name',
                                       {'type':'parsed'}).string] = \
                                       self.people_helper(mbr)
    
    
    def find_related_bills(self):
        related_bills = self.doc_ref.find_all('bill')
        if len(related_bills) > 0:
            self.crdoc['related_bills'] = \
              [bill.attrs for bill in related_bills]

    def find_related_laws(self):
        related_laws = self.doc_ref.find_all('law')
        if len(related_laws) > 0:
            self.crdoc['related_laws'] = \
              [law.attrs for law in related_laws]

    def find_related_usc(self):
        related_usc = self.doc_ref.find_all('uscode')
        if len(related_usc) > 0:
            self.crdoc['related_usc'] = list(
                itertools.chain.from_iterable(
                    [[dict([('title',usc['title'])] +
                        list(sec.attrs.items())) for sec
                        in usc.find_all('section')]
                        for usc in related_usc]
                    )
                )

    def find_related_statute(self):
        related_statute = self.doc_ref.find_all('statuteatlarge')
        if len(related_statute) > 0:
            self.crdoc['related_statute'] = list(
                itertools.chain.from_iterable(
                    [[dict([('volume',st['volume'])] +
                        list(pg.attrs.items())) for pg
                        in st.find_all('pages')]
                        for st in related_statute]
                    )
                )
        
    def date_from_entry(self):
        year, month, day = re.match(self.re_time,self.access_path).group('year','month','day')
        if self.doc_ref.time:
            from_hr,from_min,from_sec = self.doc_ref.time['from'].split(':')
            to_hr,to_min,to_sec = self.doc_ref.time['to'].split(':')
            try:
                self.doc_date = datetime(int(year),int(month),int(day))
                self.doc_start_time = datetime(int(year),int(month),int(day),\
                int(from_hr),int(from_min),int(from_sec))
                self.doc_stop_time = datetime(int(year),int(month),int(day),\
                int(to_hr),int(to_min),int(to_sec))
                self.doc_duration = self.doc_stop_time - self.doc_start_time
            except:
                logging.info('Could not extract a document timestamp.')
    
    # Flow control for metadata generation
    def gen_file_metadata(self):
        # Sometimes the searchtitle has semicolons in it so .split(';') is a nogo
        temp_ref = self.cr_dir.mods.find('accessid', text=self.access_path)
	
        #print(type(self.cr_dir))

        if temp_ref is None:
            raise RuntimeError("{} doesn't have accessid tag".format(self.access_path))
        self.doc_ref = temp_ref.parent
        matchobj = re.match(self.re_vol, self.doc_ref.searchtitle.string)
        if matchobj:
            self.doc_title, self.cr_vol, self.cr_num = matchobj.group('title','vol','num')
        else:
            logging.warn('{0} yields no title, vol, num'.format(
                self.access_path))
            self.doc_title, self.cr_vol, self.cr_num = \
              'None','Unknown','Unknown'
        self.find_people()
        self.find_related_bills()
        self.find_related_laws()
        self.find_related_usc()
        self.find_related_statute()
        self.date_from_entry()
        self.chamber = self.doc_ref.granuleclass.string
        self.re_newspeaker = self.make_re_newspeaker()
        self.item_types['speech']['patterns'] = [self.re_newspeaker]

    # That's it for metadata. Below deals with content.

    def read_htm_file(self):
        """
        This function updates a self.cur_line
        attribute. So now for each call to the iterator there are two
        pointers to the next line - one for the function,
        and one for the object.

        The purpose of the attribute is to
        give each parsing function a "starting position"
        so that the handshake between functions is easier. Now
        the current (or last) line is tracked in only one place
        and the same way by all object methods.
        """
        self.lines_remaining = True
        with open(self.filepath, 'r') as htm_file:
            htm_lines = htm_file.read()
            htm_text = BeautifulSoup(htm_lines,"lxml")
        text = htm_text.pre.text.split('\n')
        for line in text:
            self.cur_line = line
            yield line
        self.lines_remaining = False
    
    def get_header(self):
        """
        Only after I wrote this did I realize
        how bad things can go when you call
        next() on an iterator instead of treating
        it as a list.

        This code works, though.
        """
        header_in = next(self.the_text)
        if header_in == u'':
            header_in = next(self.the_text)
        match = re.match(self.re_vol_file, header_in)
        if match:
            vol, num, wkday, month, day, year = match.group( \
            'vol','num','wkday','month','day','year')
        else:
            return False
        header_in = next(self.the_text)
        match = re.match(self.re_chamber, header_in)
        if match:
            if match.group('chamber') == 'Extensions of Remarks':
                chamber = 'House'
                extensions = True
            else:
                chamber = match.group('chamber')
                extensions = False
        else:
            return False
        header_in = next(self.the_text)
        match = re.match(self.re_pages, header_in)
        if match:
            pages = match.group('pages')
        else:
            return False
        header_in = next(self.the_text)
        match = re.match(self.re_trail, header_in)
        if match:
            pass
        else:
            return False
        return vol, num, wkday, month, day, year, chamber, pages, extensions

    def write_header(self):
        self.crdoc['id'] = self.access_path
        header = self.get_header()
        if header:
            self.crdoc['header'] = {'vol':header[0],'num':header[1],\
            'wkday':header[2],'month':header[3],'day':header[4],\
            'year':header[5],'chamber':header[6],'pages':header[7],\
            'extension':header[8]}
        self.crdoc['doc_title'] = self.doc_title

    def get_title(self):
        """
        Throw out empty lines
        Parse consecutive title-matching strings into a title str
        Stop on the first line that isn't empty and isn't a title
        Return the title str if it exists.

        We pretty much assume the first title on the page applies
        to everything below it
        """

        title_str = ''
        for line in self.the_text:
            if line == u'':
                pass
            else:
                a_match = re.match(self.re_allcaps, line)
                if a_match:
                    title_str = ' '.join([title_str,a_match.group('title')])
                else:
                    break

        if len(title_str) > 0:
            return title_str.strip()
        else:
            return False

    def write_page(self):
        turn = 0
        itemno = 0
        title = self.get_title()
        the_content = []
        if title:
            self.crdoc['title'] = title
        else:
            self.crdoc['title'] = None
        while self.lines_remaining:
            # while not re.match(self.re_allcaps,self.cur_line):
            try:
                item = crItem(self).item
                if item['kind'] == 'speech':
                    item['turn'] = turn
                    turn += 1
                item['itemno'] = itemno
                itemno += 1
                the_content.append(item)
            except Exception as e:
                logging.warn('{0}'.format(e))
                break

        self.crdoc['content'] = the_content

        logging.debug('Stopped writing {0}. The last line is: {1}'.format(self.access_path,self.cur_line))

    def parse(self):
        """
        Flow control for parsing content.
        """
        self.the_text = self.read_htm_file()
        self.write_header()
        self.write_page()

    """
    This is a dict of line cases.
    In previous versions, these relations were called
    explicitly multiple times in multiple places.

    This way is more extensible and easier to track cases.

    Usage:
    If break_flow == True: <interrupt current item>
    If speaker_re == True: speaker = re.match(line,
                                     <pattern from patterns>).
                                     .group(<speaker_group>)
    else: speaker = <speaker>
    (ALSO -- see line 176 for how the speech 'patterns' list is populated)
    It has to come after some of the functions because of
    how I want to handle special cases.
    """
    item_types = { 'speech':
                   {'patterns':['Mr. BOEHNER'],
                    'speaker_re':True,
                    'speaker_group':'name',
                    'break_flow':True,
                    'special_case':False
                    },
                    'recorder':
                    {'patterns':[re_recorderstart,
                                 re_recorderend,
                                 re_recorder_ncj],
                    'speaker_re':False,
                    'speaker':'The RECORDER',
                    'break_flow':True,
                    'special_case':False
                    },
                    'clerk':
                    {'patterns':[re_clerk],
                     'speaker_re':False,
                     'speaker':'The Clerk',
                     'break_flow':True,
                     'special_case':False
                     },
                     'linebreak':
                     {'patterns':[re_linebreak],
                      'speaker_re':False,
                      'speaker':'None',
                      'break_flow':True,
                      'special_case':True,
                      'condition':'emptystr'
                      },
                      'excerpt':
                      {'patterns':[re_excerpt],
                       'speaker_re':False,
                       'speaker':'None',
                       'break_flow':True,
                       'special_case':True,
                       'condition':'lastspeaker'
                       },
                      'rollcall':
                      {'patterns':[re_rollcall],
                      'speaker_re':False,
                      'speaker':'None',
                      'break_flow':True,
                      'special_case':False
                      },
                      'metacharacters':
                      {'patterns':[re_timestamp,
                                   re_newpage],
                       'speaker_re':False,
                       'speaker':'None',
                       'break_flow':False,
                       'special_case':False
                       },
                      'empty_line':
                      {'patterns':[r'(^[\s]+$)'],
                       'speaker_re':False,
                       'speaker':'None',
                       'break_flow':False,
                       'special_case':False
                       },
                       'title':
                       {'patterns':[re_allcaps],
                        'speaker_re':False,
                        'speaker':'None',
                        'break_flow':True,
                        'special_case':False,
                        }
                    }

    stemmer = Porter2Stemmer()

    #remove stop words
    def remove_stop_words(self,frequency_list):

        stop_words = []

        with open("stop_words_list.txt") as stop_words_list:
            stop_words_data = stop_words_list.read()
            stop_words = stop_words_data.split(',')

        temp_list = []
        stemmer = Porter2Stemmer()
        try:
            for c in string.punctuation:
                frequency_list = frequency_list.replace(c, "")

            querywords = frequency_list.split()

            resultwords = [stemmer.stem(word) for word in querywords
                           if word.lower() not in stop_words]

            result = ' '.join(resultwords)

        except ValueError:
            print("no such value")

        #print(result)
        return result


    def __init__(self, abspath, cr_dir, **kwargs):

        # Some metadata
        self.crdoc = {}
        self.crdoc['header'] = False
        self.crdoc['content'] = []
        self.num_titles = 0
        self.speakers = {}
        self.doc_ref = ''
        self.doc_time = -1
        self.doc_start_time = -1
        self.doc_stop_time = -1
        self.doc_duration = -1
        self.doc_chamber = 'Unspecified'
        self.doc_related_bills = []
        
        # file data
        self.filepath = abspath
        self.filedir, self.filename = os.path.split(abspath)
        self.cr_dir = cr_dir
        #print(cr_dir)
        self.access_path = self.filename.split('.')[0]

        # Generate all metadata including list of speakers
        self.gen_file_metadata()
        # Must come after speaker list generation
        self.item_breakers = []
        self.skip_items = []
        for x in list(self.item_types.values()):
            if x['break_flow'] == True:
                self.item_breakers.extend(x['patterns'])
            else:
                self.skip_items.extend(x['patterns'])

        # Parse the file
        self.parse()
        #print( self.crdoc['content'])

        for speech in self.crdoc['content']:
            #print("Code running")
            if speech['kind'] == 'speech':
                #print(speech['text'])
                if speech['speaker_bioguide']:
                    keybioguideid = speech['speaker_bioguide']
                    outpath = os.path.join('', 'json', keybioguideid + '.json')

                    with open(outpath) as json_data:
                        d = json.load(json_data)

                    if d['party'] == 'D':
                        print("D")
                        with open('democratic/' + speech['speaker'] + '-' + str(keybioguideid) + '.txt', 'a+') as out_json:
                            out_json.write(str(self.remove_stop_words(speech['text'])) + '\n')
                        os.chmod('democratic/' + speech['speaker'] + '-' + str(keybioguideid) + '.txt', 0o777)

                        with open('democratic_speeches.txt', 'a+') as out_json:
                            out_json.write(str(self.remove_stop_words(speech['text'])) + '\n')

                    elif d['party'] == 'R':
                        print("R")
                        with open('republican/' + speech['speaker'] + '-' + str(keybioguideid) + '.txt', 'a+') as out_json:
                            out_json.write(str(self.remove_stop_words(speech['text'])) + '\n')
                        os.chmod('republican/' + speech['speaker'] + '-' + str(keybioguideid) + '.txt', 0o777)

                        with open('republican_speeches.txt', 'a+') as out_json:
                            out_json.write(str(self.remove_stop_words(speech['text'])) + '\n')


	'''
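
get_header above walks the first lines of each .htm granule with the regexes defined at the top of the class. A small stand-alone sketch of the volume-line match (the pattern is copied from re_vol_file, and the sample header is made up but follows the format get_header expects):

import re

re_vol_file = (r'^\[Congressional Record Volume (?P<vol>[0-9]+), Number (?P<num>[0-9]+)'
               r' \((?P<wkday>[A-Za-z]+), (?P<month>[A-Za-z]+) (?P<day>[0-9]+), (?P<year>[0-9]{4})\)\]')

header = '[Congressional Record Volume 163, Number 1 (Tuesday, January 3, 2017)]'
match = re.match(re_vol_file, header)
print(match.group('vol', 'num', 'wkday', 'month', 'day', 'year'))
# ('163', '1', 'Tuesday', 'January', '3', '2017')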
Example #7
def index_file(directory, file_name, documentID, index):
    stemmer = Porter2Stemmer()
    # Dealing with punctuation
    p = dict.fromkeys(string.punctuation)
    p.pop('-')  # we need to deal with hyphens
    weight_map = {}

    try:
        with open(directory + file_name) as txt_file:
            # Trying to normalize the vocab, getting rid of non alphanumeric,
            body = txt_file.read().replace('\n', '').lower()
            body = re.sub(r'[^A-Za-z0-9#]+', ' ', body)
            body = body.split(
                ' ')  # Gets rid of any \n that appear in the text

            body = list(
                filter(lambda t: t != '' and t != '-',
                       body))  # remove single spaces and single hyphens

            position = 0
            for term in body:
                # take care of hyphenated words
                if '-' in term:
                    unhyphenated_word = term.replace('-', '')
                    index.add_term(stemmer.stem(unhyphenated_word), documentID,
                                   position)
                    hyphened_tokens = term.split('-')
                    for t in hyphened_tokens:
                        all_docs_index.add_term(stemmer.stem(t), documentID,
                                                position)
                else:
                    index.add_term(stemmer.stem(term), documentID, position)
                position += 1
                if term not in weight_map:
                    weight_map[term] = 1
                else:
                    weight_map[term] = weight_map[term] + 1

    except FileNotFoundError as e:
        print(e)

    doc_total_tf[
        file_name] = weight_map  # given a document, it will return a map of that doc's tf

    score_map = {}
    wdt = 0
    # Gets the Wdt's of the terms in the file
    for tf in weight_map:
        score = pow(1 + log(weight_map[tf]), 2)
        score_map[tf] = score
        wdt += score**2
    Ld = sqrt(wdt)
    length = 0
    for tf in score_map:
        score_map[tf] = score_map[tf] / Ld
        length += score_map[tf]**2

    doc_wdt[file_name] = score_map
    # Things to turn in for Neal, it's just the easiest place to put this
    if file_name == 'paper_52.txt':
        print('First 30 components of document 52')
        get_first_thirty(score_map, True)
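
The weighting above computes w_dt = (1 + ln tf)^2 per term, an L2 length Ld over those scores, and then score/Ld normalisation. A tiny worked sketch of that same computation on one toy document:

from math import log, sqrt

# Term frequencies for one toy document.
weight_map = {'search': 3, 'engine': 1}

# w_dt = (1 + ln tf)^2, exactly as in index_file above.
score_map = {t: (1 + log(tf)) ** 2 for t, tf in weight_map.items()}

# Document length used for normalisation.
Ld = sqrt(sum(score ** 2 for score in score_map.values()))
normalised = {t: score / Ld for t, score in score_map.items()}
print(normalised)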
Example #8
def stemming():
    stemmer = Porter2Stemmer()
    for w in words():
        print(w, stemmer.stem(w), sep='\t')
Example #9
    def stemming_tokenizer(self, text):
        stemmer = Porter2Stemmer()
        return [stemmer.stem(w) for w in word_tokenize(text)]
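
A tokenizer like this is usually handed to a vectorizer. A minimal usage sketch (pairing it with scikit-learn's TfidfVectorizer is an assumption about how the method is used, and word_tokenize needs the NLTK 'punkt' data):

from nltk.tokenize import word_tokenize
from porter2stemmer import Porter2Stemmer
from sklearn.feature_extraction.text import TfidfVectorizer

def stemming_tokenizer(text):
    stemmer = Porter2Stemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]

# Each document is tokenized and stemmed before tf-idf weighting.
vectorizer = TfidfVectorizer(tokenizer=stemming_tokenizer)
X = vectorizer.fit_transform(['cooking recipes', 'cooked recipe ideas'])
print(vectorizer.get_feature_names_out())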