    @staticmethod
    def extract_dictionary(document_paths, max_words):
        """
        Extracts a gensim Dictionary object from a set of documents.

        Parameters
        ----------
        document_paths : [str]
            List of document paths that make up the corpus.
        max_words : int
            Approximate upper bound on the vocabulary size; the dictionary
            is filtered down whenever it grows beyond this limit.

        Returns
        -------
        dictionary : gensim.corpora.Dictionary
            Extracted dictionary (or tokenizer).
        """
        print("Extracting dictionary from corpus")
        dictionary = Dictionary(prune_at=None)
        preprocessor = TextPreprocessor()
        for document_path in tqdm(document_paths):
            with open(document_path, "r") as f:
                document = f.read()
            document = preprocessor.clean_sentence(document,
                                                   alphabetic_only=True)
            words = preprocessor.tokenize_text(document)
            dictionary.add_documents([words])
            if len(dictionary) > max_words:
                start = time()
                dictionary.filter_extremes(no_below=10,
                                           no_above=0.5,
                                           keep_n=int(max_words * 0.9))
                print("Dictionary filtered in {} seconds".format(time() -
                                                                 start))
        return dictionary
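
A minimal usage sketch for the method above, assuming it lives on the IterativeCorpusBuilder class shown later in this listing; the glob pattern and output path are illustrative:

from glob import glob

document_paths = glob("data/raw/my_corpus/*.txt")  # hypothetical corpus location
dictionary = IterativeCorpusBuilder.extract_dictionary(document_paths,
                                                       max_words=2 * 10**6)
dictionary.save("data/processed/my_corpus/tokenizer")  # standard gensim Dictionary.save
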
Example #2
def preclean_xlsxParagraph(text_list):
    preprocessor = TextPreprocessor()
    word_list = list()
    for text in text_list:
        clean_text = preprocessor.clean_sentence(text)
        word_list.append(' '.join(preprocessor.tokenize_text(clean_text)))
    return word_list
class BM25Engine(SearchEngine):
    tokenizer_file = "data/processed/{id}/tokenizer"
    corpus_file = "data/processed/{id}/corpus"
    doc_idxs_file = "data/processed/{id}/doc_idxs.json"

    def __init__(self, tokenizer, corpus, idxs2id):
        """
        Parameters
        ----------
        tokenizer : gensim.corpora.Dictionary
            Word tokenizer.
        corpus : gensim.corpora.mmcorpus.MmCorpus
            Bag-of-words formatted corpus of documents.
        idxs2id : dict
            Mapping from corpus position (as a string key) to document id.
        """
        self.preprocessor = TextPreprocessor()
        self.tokenizer = tokenizer
        self.corpus = corpus
        self.internal_engine = BM25(self.corpus)
        self.idxs2id = idxs2id
        print("BM25 engine loaded")

    def top_k_matches(self, query, k):
        clean = self.preprocessor.clean_sentence(query, alphabetic_only=True)
        word_list = self.preprocessor.tokenize_text(clean)
        bow_representation = self.tokenizer.doc2bow(word_list)
        scores = self.internal_engine.get_scores(bow_representation)
        top_k_idxs = np.argsort(scores)[::-1][:k]
        return [self.idxs2id[str(idx)] for idx in top_k_idxs]

    @staticmethod
    def load(id):
        tokenizer = Dictionary.load(BM25Engine.tokenizer_file.format(id=id))
        corpus = MmCorpus(BM25Engine.corpus_file.format(id=id))
        with open(BM25Engine.doc_idxs_file.format(id=id), "r") as f:
            idxs2id = json.load(f)
        return BM25Engine(tokenizer, corpus, idxs2id)
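
A hedged usage sketch for BM25Engine: the corpus id and query are made up, and the data/processed/{id}/ artifacts referenced by the class attributes are assumed to already exist on disk:

engine = BM25Engine.load("my_corpus")  # hypothetical corpus id
doc_ids = engine.top_k_matches("forest restoration incentives", k=5)
print(doc_ids)  # ids of the 5 highest-scoring documents
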
Example #6
    def highlight(self, document, query, precision):
        '''document must be the JSON page dictionary (keys like "page_1") so
        that page numbers can be extracted; a plain-text string is also
        accepted, in which case pages are returned as None'''
        highlights = []
        scores = []
        pages = []
        if isinstance(document, dict):
            for page_num, text in document.items():
                page_num = page_num.split("_")[1]
                for sentence in text.split("\n\n"):
                    sentence = re.sub("\n", " ", sentence)
                    sentence = re.sub(" +", " ", sentence)
                    sentence = sentence.strip()
                    if len(sentence) < 60:
                        continue
                    score = self._model.get_similarity(sentence, query)
                    if score > precision:
                        highlights.append(sentence)
                        scores.append(score)
                        pages.append(page_num)
            sorted_idxs = np.argsort(scores)[::-1]
            highlights = [highlights[idx] for idx in sorted_idxs]
            scores = [scores[idx] for idx in sorted_idxs]
            pages = [pages[idx] for idx in sorted_idxs]

            return highlights, scores, pages

        else:
            preprocessor = TextPreprocessor()
            clean_text = preprocessor.clean_sentence(document)
            paragraphs = preprocessor.split_into_paragraphs(document)

            for paragraph in paragraphs:
                paragraph = re.sub("\n", " ", paragraph)
                paragraph = re.sub(" +", " ", paragraph)
                paragraph = paragraph.strip()
                if len(paragraph) < 60:
                    continue
                score = self._model.get_similarity(paragraph, query)
                if score > precision:
                    highlights.append(paragraph)
                    scores.append(score)
            sorted_idxs = np.argsort(scores)[::-1]
            highlights = [highlights[idx] for idx in sorted_idxs]
            scores = [scores[idx] for idx in sorted_idxs]

            return highlights, scores, None
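
A usage sketch for the highlight method above; the highlighter instance, page dictionary, and precision threshold are all illustrative (the enclosing class is not shown in this excerpt):

# 'highlighter' is an instance of the (unnamed) class this method belongs to
pages = {"page_1": "A long paragraph about financial incentives ...\n\nAnother paragraph ..."}
highlights, scores, page_nums = highlighter.highlight(pages,
                                                      query="financial incentives",
                                                      precision=0.5)
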
Example #7
class BoWHighlighter(SegmentHighlighter):
    def __init__(self):
        self.preprocessor = TextPreprocessor()
        self.counter = CountVectorizer()

    def highlight(self, document, query, precision):
        clean_query = self.preprocessor.clean_sentence(
            query)  #, alphabetic_only=True
        word_list = self.preprocessor.tokenize_text(clean_query)
        #Convert query after tokenization as a string
        query_mod = ' '.join(word_list)
        document = re.sub(" +", " ", document)
        document = re.sub(r"\n\s+", " \n \n ", document)
        document = re.sub("\n+", "\n", document)
        # document = re.sub("\n*", "\n\n ", document)
        document = document.strip()
        highlights = []
        scores = []
        for paragraph in document.split(" \n \n "):
            if len(paragraph) == 0:
                continue
            clean_paragraph = self.preprocessor.clean_sentence(
                paragraph)  #, alphabetic_only=True
            corpus = self.preprocessor.tokenize_text(clean_paragraph)
            paragraph_mod = ' '.join(corpus)
            vectorizer = self.counter.fit([query_mod, paragraph_mod])
            vectors = [
                vec for vec in vectorizer.transform([query_mod, paragraph_mod
                                                     ]).toarray()
            ]
            norm_vec_query = np.linalg.norm(vectors[0])
            norm_vec_paragraph = np.linalg.norm(vectors[1])
            if norm_vec_paragraph == 0:
                continue
            cosine_similarity = np.dot(
                vectors[0], vectors[1]) / (norm_vec_query * norm_vec_paragraph)
            if cosine_similarity > precision:
                print("The cosine similarity is: ", cosine_similarity,
                      paragraph, "\n \n")
                highlights.append(paragraph)
                scores.append(cosine_similarity)
        return highlights, scores

    @staticmethod
    def load(id):
        return BoWHighlighter()
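
A small usage sketch for BoWHighlighter; the document string and precision value are illustrative, with paragraphs separated by blank lines as highlight() expects (actual scores depend on TextPreprocessor's cleaning):

highlighter = BoWHighlighter()
document = ("The programme funds reforestation of degraded land across the region.\n\n"
            "Unrelated administrative text about office scheduling and supplies.")
highlights, scores = highlighter.highlight(document,
                                           query="reforestation funding",
                                           precision=0.1)
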
Example #8
def preclean_entireDoc(text):
    preprocessor = TextPreprocessor()
    clean_text = preprocessor.clean_sentence(text)
    word_list = preprocessor.tokenize_text(clean_text)
    words = ' '.join(word_list)
    return words
Example #9
min_paragraph_length = 60
substring_similarity_threshold = 60
blacklist = [
    "Presupuesto de Egresos de la Federación", "Sembrando Vida",
    "poseedores de terrenos forestales "
]

if __name__ == "__main__":
    # Parse command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-dataset_dir", type=str)
    parser.add_argument("-tags_path", type=str)
    args = vars(parser.parse_args())
    # Instantiate model
    sbert = SentenceEmbeddings("xlm-r-100langs-bert-base-nli-stsb-mean-tokens")
    preprocessor = TextPreprocessor()
    # Iterate over tags file
    tags = pd.read_excel(args["tags_path"], sheet_name=None)
    scores = []
    labels = []
    incentive_paragraphs = []
    for country, country_tags in tags.items():
        print("Processing data available for '{}'".format(country))

        # Get the available documents' paths
        def get_txt_path(pdf_path):
            if isinstance(pdf_path, str):
                txt_path = os.path.basename(pdf_path).replace(".pdf", ".txt")
                return os.path.join(args["dataset_dir"], country, "txt",
                                    txt_path)
            else:
Example #10
                    help='''
If True, keeps capitalization. Set to True by default.
''')
parser.add_argument('--join',
                    nargs='?',
                    default=False,
                    type=bool,
                    help='''
If True, joins all files into a single file. Set to False by default.
''')
parser.add_argument('--temp_files_path',
                    nargs='?',
                    default='data/tmp',
                    type=str,
                    help='''
Directory to store temp files during preprocessing. Set to data/tmp by default.
''')
parser.add_argument('--output_filename',
                    nargs='?',
                    default='preprocessed.txt',
                    type=str,
                    help='''
If exporting a single file, the filename of this exported file. Set to preprocessed.txt by default.
''')
args = parser.parse_args()

if __name__ == "__main__":
    nltk.download('stopwords')
    nltk.download('punkt')
    TextPreprocessor().text_preprocessor_main(args)
class IterativeCorpusBuilder():
    def __init__(self, document_paths, max_words):
        self.tokenizer = IterativeCorpusBuilder.extract_dictionary(
            document_paths, max_words=max_words)
        self.document_paths = iter(document_paths)
        self.preprocessor = TextPreprocessor()
        self.clock = time()
        self.iterations = 0
        self.inform_frequency = 1000
        self.max_words = 2 * 10**6

    def __next__(self):
        with open(next(self.document_paths), "r") as f:
            document = f.read()
        document = self.preprocessor.clean_sentence(document,
                                                    alphabetic_only=True)
        words = self.preprocessor.tokenize_text(document)
        bow_representation = self.tokenizer.doc2bow(words)
        # Inform progress as specified
        self.iterations += 1
        if self.iterations % self.inform_frequency == 0:
            print("{} iterations took {} seconds. {} done.".format(
                self.inform_frequency,
                time() - self.clock, self.iterations))
            self.clock = time()
        return bow_representation

    def __iter__(self):
        print("Building corpus term-document matrices")
        return self

    @staticmethod
    def extract_dictionary(document_paths, max_words):
        """
        Extracts a gensim Dictionary object from a set of documents.

        Parameters
        ----------
        document_paths : [str]
            List of document paths that make up the corpus.
        max_words : int
            Approximate upper bound on the vocabulary size; the dictionary
            is filtered down whenever it grows beyond this limit.

        Returns
        -------
        dictionary : gensim.corpora.Dictionary
            Extracted dictionary (or tokenizer).
        """
        print("Extracting dictionary from corpus")
        dictionary = Dictionary(prune_at=None)
        preprocessor = TextPreprocessor()
        for document_path in tqdm(document_paths):
            with open(document_path, "r") as f:
                document = f.read()
            document = preprocessor.clean_sentence(document,
                                                   alphabetic_only=True)
            words = preprocessor.tokenize_text(document)
            dictionary.add_documents([words])
            if len(dictionary) > max_words:
                start = time()
                dictionary.filter_extremes(no_below=10,
                                           no_above=0.5,
                                           keep_n=int(max_words * 0.9))
                print("Dictionary filtered in {} seconds".format(time() -
                                                                 start))
        return dictionary
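
A sketch of how the builder could feed gensim's corpus serialization; the paths and word limit are illustrative, and MmCorpus/Dictionary saving follow the standard gensim API used elsewhere in these examples:

from glob import glob

from gensim.corpora import MmCorpus

document_paths = glob("data/raw/my_corpus/*.txt")  # hypothetical corpus location
builder = IterativeCorpusBuilder(document_paths, max_words=2 * 10**6)
MmCorpus.serialize("data/processed/my_corpus/corpus", builder)  # streams one BoW vector per document
builder.tokenizer.save("data/processed/my_corpus/tokenizer")
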
Example #12
    def __init__(self):
        self.preprocessor = TextPreprocessor()
        self.counter = CountVectorizer()