Example 1
def create_contract_graph(directory, title):
    """
    Generates a bar graph with the labels as the folders in the directory and the
    bar heights as the number of documents in each folder

    Params
    directory: str where the folders are located
    title: str title for bar graph
    """
    ds = DirectoryAssistor()
    folder_lst = ds.create_content_list(directory)
    folder_dic = {}
    for folder in folder_lst:
        folder_dic[folder] = len(ds.create_content_list(directory + folder))
    sort_folder_dic = sorted(folder_dic.items(),
                             key=lambda x: x[1],
                             reverse=True)
    cols = []
    labels = []
    for i in sort_folder_dic:
        labels.append(i[0])
        cols.append(i[1])
    fig, ax = plt.subplots(figsize=(10, 10))
    make_sns_bar_plot(ax, cols, labels, title=title)
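
# Minimal usage sketch for create_contract_graph, assuming DirectoryAssistor
# and make_sns_bar_plot are defined elsewhere in the project; the directory
# path below is purely illustrative and must end with a slash.
import matplotlib.pyplot as plt

create_contract_graph('contracts/', title='Documents per contract type')
plt.show()
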
Example 2
class ContractPipeline():
    def __init__(self, directory, stop_words):
        """
        Instantiates a ContractPipeline Object

        Params
        directory: str for main directory where the folders for the documents are stored
        stop_words: list of words that will be removed from the documents
        """
        self.directory = directory
        self.stop_words = stop_words
        self.ds = DirectoryAssistor()
        self.individual_bag_o_words = {}

    def get_list_of_docs(self):
        """
        Creates
        target_lst: list that has all of the types for each document in the
        corresponding index of doc_text_lst
        doc_text_lst: list of lowercased cleaned strings for the text in each document
        """
        print('Converting to txt lists')
        start_time = time.time()

        folder_lst = self.ds.create_content_list(self.directory)
        doc_lst = []
        self.target_lst = []
        self.doc_text_lst = []

        for i in range(len(folder_lst)):
            doc_lst.append(
                self.ds.create_content_list(self.directory + folder_lst[i]))
            self.individual_bag_o_words[folder_lst[i]] = []
            for j in range(len(doc_lst[i])):
                text = textract.process(self.directory + folder_lst[i] + '/' +
                                        doc_lst[i][j])
                # convert to str
                text = text.decode('utf-8')
                # lowercase all text
                text = text.lower()
                # remove all punctuation
                text = re.sub(r'\W+', ' ', text)
                # remove underscores
                text = text.replace("_", "")
                self.doc_text_lst.append(text)
                self.target_lst.append(folder_lst[i])
                self.individual_bag_o_words[folder_lst[i]].append(text.split())
        lst = []
        for val in self.target_lst:
            lst.append(val.replace('_', ' '))
        self.target_lst = lst

        end_time = time.time()
        print(f'This took {end_time-start_time:.2f} seconds')

    def get_list_of_txts(self):
        """
        Creates
        target_lst: list that has all of the types for each document in the
        corresponding index of doc_text_lst
        doc_text_lst: list of lowercased cleaned strings for the text in each document
        """
        print('Converting to txt lists')
        start_time = time.time()

        folder_lst = self.ds.create_content_list(self.directory)
        doc_lst = []
        self.target_lst = []
        self.doc_text_lst = []

        for i in range(len(folder_lst)):
            doc_lst.append(
                self.ds.create_content_list(self.directory + folder_lst[i]))
            self.individual_bag_o_words[folder_lst[i]] = []
            for j in range(len(doc_lst[i])):
                # read in file as str
                try:
                    with open(
                            self.directory + folder_lst[i] + '/' +
                            doc_lst[i][j], 'r') as f:
                        text = f.read().replace('\n', '')
                except (OSError, UnicodeDecodeError):
                    # skip files that cannot be opened or decoded
                    continue

                # lowercase all text
                text = text.lower()

                # remove all punctuation
                text = re.sub(r'\W+', ' ', text)

                # remove underscores
                text = text.replace("_", "")

                self.doc_text_lst.append(text)
                self.target_lst.append(folder_lst[i])
                self.individual_bag_o_words[folder_lst[i]].append(text.split())
        lst = []
        for val in self.target_lst:
            lst.append(val.replace('_', ' '))
        self.target_lst = lst

        end_time = time.time()
        print(f'This took {end_time-start_time:.2f} seconds')

    def bag_o_words(self):
        """
        Creates
        self.individual_bag_o_words: dict mapping each folder to a Counter of
        the words in its documents
        self.total_bag_o_words: Counter of the words across all documents
        """
        print('Creating bag o words')
        start_time = time.time()

        for key in self.individual_bag_o_words.keys():
            lst = []
            for val in self.individual_bag_o_words[key]:
                for word in val:
                    lst.append(word)
            self.individual_bag_o_words[key] = Counter(lst)
        total_word_lst = []
        for i in self.doc_text_lst:
            lst = i.split()
            for j in lst:
                total_word_lst.append(j)
        self.total_bag_o_words = Counter(total_word_lst)

        end_time = time.time()
        print(f'This took {end_time-start_time:.2f} seconds')

    def join_list_of_strings(self, lst):
        """
        Joins the list into a string
        
        Params
        lst: list of words
        """
        return [" ".join(x) for x in lst]

    def remove_stop_words(self):
        """
        Returns a new list of strings with stop words removed

        stops_removed_str: list of strings with stop words removed
        stops_removed_lst: list of lists containing words with stops removed
        """
        print('Removing stop words')
        start_time = time.time()

        split_lst = [txt.split() for txt in self.doc_text_lst]
        self.stops_removed_lst = []

        for split in split_lst:
            stops = [w for w in split if w not in self.stop_words]
            stop_num = [
                w for w in stops
                if not (w.isdigit() or (w[0] == '-' and w[1:].isdigit()))
            ]
            self.stops_removed_lst.append(stop_num)

        self.stops_removed_str = self.join_list_of_strings(
            self.stops_removed_lst)

        end_time = time.time()
        print(f'This took {end_time-start_time:.2f} seconds')

    def word_condenser(self):
        """
        Stems and lemmatizes the stop-word-cleaned documents with three
        techniques

        Creates
        self.porter_str: list of strings with the Porter stem technique used
        self.snowball_str: list of strings with the Snowball stem technique used
        self.wordnet_str: list of strings with the WordNet lemmatize technique used
        """
        print('Condensing')
        start_time = time.time()

        porter = PorterStemmer()
        snowball = SnowballStemmer('english')
        wordnet = WordNetLemmatizer()

        porter_lst = [[porter.stem(w) for w in words]
                      for words in self.stops_removed_lst]
        snowball_lst = [[snowball.stem(w) for w in words]
                        for words in self.stops_removed_lst]
        wordnet_lst = [[wordnet.lemmatize(w) for w in words]
                       for words in self.stops_removed_lst]

        self.porter_str = self.join_list_of_strings(porter_lst)
        self.snowball_str = self.join_list_of_strings(snowball_lst)
        self.wordnet_str = self.join_list_of_strings(wordnet_lst)

        end_time = time.time()
        print(f'This took {end_time-start_time:.2f} seconds')

    def count_vectorizer(self, max_features=None, ngram_range=(1, 1)):
        """
        Sets up a word count matrix, a tfidf matrix, and a CountVectorizer for
        the documents in the directory

        Params
        documents: list of strings to be vectorized

        Returns
        count_matrix: matrix with word counts
        tfidf_matrix: a tfidf matrix of the documents
        cv: CountVectorizer object for the documents
        """
        print('Generating tfidf and count matrix')
        start_time = time.time()

        self.cv = CountVectorizer(max_features=max_features,
                                  ngram_range=ngram_range)
        self.tf_matrix = self.cv.fit_transform(self.stops_removed_str)

        end_time = time.time()
        print(f'This took {end_time-start_time:.2f} seconds')

    def tf_vect(self, documents, max_features=None, ngram_range=(1, 1)):
        """
        Returns tf-idf matrix from documents
        
        Prams
        documents: list of strings
        """
        print('Generating tfidf')
        start_time = time.time()

        self.vect = TfidfVectorizer(max_features=max_features,
                                    ngram_range=ngram_range)
        self.tfidf = self.vect.fit_transform(documents)

        end_time = time.time()
        print(f'This took {end_time-start_time:.2f} seconds')
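

# Minimal usage sketch for ContractPipeline, with an illustrative directory
# path and NLTK's English stop words (assumes the stopwords corpus has been
# downloaded). The call order follows the methods defined above.
from nltk.corpus import stopwords

contract_pipeline = ContractPipeline('contracts/', stopwords.words('english'))
contract_pipeline.get_list_of_txts()   # read and clean each .txt document
contract_pipeline.bag_o_words()        # per-folder and total word counts
contract_pipeline.remove_stop_words()  # drop stop words and numeric tokens
contract_pipeline.word_condenser()     # Porter/Snowball stems, WordNet lemmas
contract_pipeline.tf_vect(contract_pipeline.wordnet_str, max_features=5000)

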
class ContractSifter():

    def __init__(self, directory, stop_words):
        """
        Instantiates a ContractSifter object

        Params
        directory: str for main directory where the folders for the documents
        are stored
        stop_words: list of words that will be removed from the documents
        """
        self.directory = directory
        self.ds = DirectoryAssistor()
        self.stop_words = stop_words
        self.porter_dic = {}
        self.snowball_dic = {}
        self.wordnet_dic = {}
        self.combined_dic = {}

    def create_dic(self):
        """
        Returns a dictionary with folder names as the keys and an empty lst 
        as values

        params

        folder_names: list of folder names in the directory

        Returns 
        Dictionary
        """
        lst = self.ds.create_content_list(self.directory)
        word_dic = {key: [] for key in lst}
        return word_dic
    
    def remove_stop_words(self, lst):
        """
        Returns lst with the stop words removed
        """
        return [w for w in lst if w not in self.stop_words]
    
    def add_words(self):
        """
        Adds words from the files in the directories that are associated with
        the keys in the self.word_dic

        Returns
        self.word_dic with a list of words with the following removed from each 
        file in the folder for that key:
            1. Stop words
            2. Punctuation
            3. Underscores
        """
        self.word_dic = self.create_dic()
        for key in self.word_dic.keys():
            lst = self.ds.create_content_list(self.directory+key)
            for file in lst:
                full_text = textract.process(self.directory+key+'/'+file)
                str_full_text = full_text.decode('utf-8')
                lower_full_text = str_full_text.lower()
                edited_text = re.sub(r'\W+', ' ', lower_full_text)
                edited_text = edited_text.replace("_","")
                tokens = word_tokenize(edited_text)
                stop_lst = self.remove_stop_words(tokens)
                self.word_dic[key].append(stop_lst)
    
    def combine_word_lists(self):
        """
        Combine all of the lists for a key into one list from the Pipeline
        word_dic attribute
        """
        for key in self.word_dic.keys():
            result = []
            for lst in self.word_dic[key]:
                result.extend(lst)
            self.combined_dic[key] = result
    
    def word_condenser(self):
        """
        Stems and lemmatizes the words for each key in combined_dic and stores
        the results in porter_dic, snowball_dic, and wordnet_dic
        """
        porter = PorterStemmer()
        snowball = SnowballStemmer('english')
        wordnet = WordNetLemmatizer()
        for key in self.combined_dic.keys():
            porter_lst = []
            snowball_lst = []
            wordnet_lst = []
            for word in self.combined_dic[key]:
                porter_lst.append(porter.stem(word))
                snowball_lst.append(snowball.stem(word))
                wordnet_lst.append(wordnet.lemmatize(word))
            self.porter_dic[key] = porter_lst
            self.snowball_dic[key] = snowball_lst
            self.wordnet_dic[key] = wordnet_lst
    
    def word_count(self, dic):
        """
        Returns the count of the words in each key of the dictionary

        Params

        dic = dict for which the words will be counted

        Returns

        word_count_dic: dict with word count for each key
        """
        word_count_dic = {}
        for key, val in dic.items():
            word_count_dic[key] = Counter(val)
        return word_count_dic

    def word_cloud(self, dic):
        """
        Generates a word cloud for each key in the dic

        Params

        dic: dict for which the word cloud will be generated

        Displays

        A plot with a word cloud for each key in dic
        """

        word_cloud_dic = {}
        for key, val in dic.items():
            word_cloud_dic[key] = ' '.join(val)
        wc_lst = []
        for val in word_cloud_dic.values():
            wc = WordCloud(width=1000, height=1000, background_color='white',
                           min_font_size=9)
            wc_lst.append(wc.generate(val))
        fig, axs = plt.subplots(3,3, figsize=(15,12))
        titles = list(dic.keys())
        for cloud, title, ax in zip(wc_lst, titles, axs.flatten()):
            chartBox = ax.get_position()
            ax.set_position(pos=[chartBox.x0,chartBox.y0,chartBox.width*1.05,
                                        chartBox.height*1.05])
            ax.imshow(cloud)
            ax.set_title(title, fontsize=16, weight='bold')
            ax.axis("off")
        axs[2,1].set_axis_off()
        axs[2,2].set_axis_off()
        chartBox = axs[2,0].get_position()
        axs[2,0].set_position(pos=[chartBox.x0*2.8,chartBox.y0*.9,chartBox.width*1.05,
                                    chartBox.height*1.05])
        plt.show()
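

# Minimal usage sketch for ContractSifter, with an illustrative directory path
# and NLTK's English stop words; the word_cloud grid above assumes roughly
# seven folders, so treat the plotting call as a sketch as well.
from nltk.corpus import stopwords

sifter = ContractSifter('contracts/', stopwords.words('english'))
sifter.add_words()                      # tokenize and clean each file
sifter.combine_word_lists()             # one word list per folder
sifter.word_condenser()                 # build the stem/lemma dictionaries
wordnet_counts = sifter.word_count(sifter.wordnet_dic)  # Counter per folder
sifter.word_cloud(sifter.wordnet_dic)   # one word cloud per folder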
    
    # delete the intermediate image files
    for img in image_list:
        da.delete_files(img)

    # merge the individual page files into a single output file
    merge_pages(page_lst=page_lst, out_dir=out_dir, file_name=file_name)

    # delete the individual page files once they have been merged
    for page in page_lst:
        da.delete_files(out_dir + page)



 
if __name__ == '__main__':
    directory='/Users/justinlansdale/Documents/Galvanize/Capstone3/EC2Data/\
changeFolder/pdfs/Original/'
    out_dir = '/Users/justinlansdale/Documents/Galvanize/Capstone3/EC2Data/\
changeFolder/Amended/'

    da = DirectoryAssistor()
    doc_lst = da.create_content_list(directory)
    print(doc_lst)
    start = time.time()
    
    for i in range(len(doc_lst)):
        file_name = doc_lst[i]
        try:
            main(directory, file_name, out_dir)
        except Exception:
            # skip documents that fail to convert
            continue
    end = time.time()
    print(end-start)
class AmendmentDetector():
    def __init__(self, original_doc_dir, amended_doc_dir, original_txt_dir,
                 amended_txt_dir):
        """
        Instantiates the amendment detection class
        
        Params:

        original_doc_dir: str directory where the original document is located
        amended_doc_dir: str directory where the amended document is located
        original_txt_dir: str directory where the original txt file will be
        stored
        amended_txt_dir: str directory where the amended txt file will be 
        stored
        """

        self.original_doc_dir = original_doc_dir
        self.amended_doc_dir = amended_doc_dir
        self.original_txt_dir = original_txt_dir
        self.amended_txt_dir = amended_txt_dir
        self.da = DirectoryAssistor()

    def convert_original_to_txt(self):
        """
        Converts the original documents to .txt files
        """

        doc_lst = self.da.create_content_list(self.original_doc_dir)

        for i in range(len(doc_lst)):
            file_name = doc_lst[i]
            try:
                main(self.original_doc_dir, file_name, self.original_txt_dir)
            except Exception:
                # skip documents that fail to convert
                continue

    def convert_amended_to_txt(self):
        """
        Converts the amended documents to .txt files
        """
        doc_lst = self.da.create_content_list(self.amended_doc_dir)

        for i in range(len(doc_lst)):
            file_name = doc_lst[i]
            try:
                main(self.amended_doc_dir, file_name, self.amended_txt_dir)
            except Exception:
                # skip documents that fail to convert
                continue

    def read_in_files(self):
        """
        Reads in txt files
        """

        original_lst = self.da.create_content_list(self.original_txt_dir)
        amended_lst = self.da.create_content_list(self.amended_txt_dir)
        for doc in original_lst:
            try:
                with open(self.original_txt_dir + doc, 'r') as f:
                    self.original = f.read().replace('\n', ' ')
            except (OSError, UnicodeDecodeError):
                # skip files that cannot be opened or decoded
                continue

        for doc in amended_lst:
            try:
                with open(self.amended_txt_dir + doc, 'r') as f:
                    self.amended = f.read().replace('\n', ' ')
            except (OSError, UnicodeDecodeError):
                # skip files that cannot be opened or decoded
                continue

    def print_changes(self):
        """
        Prints changes made in the amended document
        """

        original_lst = self.original.split()
        amended_lst = self.amended.split()
        original_value = []
        amended_value = []
        change_ref = []
        # compare word by word up to the length of the shorter document
        for i in range(min(len(original_lst), len(amended_lst))):
            if original_lst[i] != amended_lst[i]:
                original_value.append(original_lst[i])
                # keep the surrounding words for reference context
                change_ref.append(original_lst[max(0, i - 40):i + 40])
                amended_value.append(amended_lst[i])

        for i in range(len(original_value)):
            print(f'\n Change # {i+1}: {original_value[i]} changed to \
{amended_value[i]} \n \n Reference text: {" ".join(change_ref[i])} \n')
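

# Minimal usage sketch for AmendmentDetector, with illustrative directory
# paths (each ending in a slash, as the class expects); conversion relies on
# the main() and DirectoryAssistor helpers from elsewhere in the project.
detector = AmendmentDetector(original_doc_dir='contracts/original/pdfs/',
                             amended_doc_dir='contracts/amended/pdfs/',
                             original_txt_dir='contracts/original/txt/',
                             amended_txt_dir='contracts/amended/txt/')
detector.convert_original_to_txt()  # write .txt versions of the originals
detector.convert_amended_to_txt()   # write .txt versions of the amendments
detector.read_in_files()            # load the two texts into memory
detector.print_changes()            # word-by-word diff with reference context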