Example #1
    def calc_tfidf(self, corpus_path: str) -> None:
        """Calculate all the TFIDF values for a given corpus file.

        Documents are separated by an empty line. There is one sentence
        per line. Write the scores to the file 'tfidf.txt'. The
        file has the following format:
        lemma_id score-doc-1 score-doc-2 ...

        Args:
            corpus_path: path to the corpus file
        """
        docs = utility_functions.get_docs(corpus_path, sent_tokenized=False)
        tfidf = TfidfVectorizer(analyzer='word',
                                tokenizer=identity_function,
                                preprocessor=identity_function,
                                token_pattern=None)
        print('fit-transform tfidf')
        tfidf_matrix = tfidf.fit_transform(docs).toarray()
        # Note: newer scikit-learn versions replace get_feature_names()
        # with get_feature_names_out().
        lemma_ids = tfidf.get_feature_names()
        path_out = os.path.join(self.path_out, 'tfidf.txt')
        print('write tfidf to file...')
        with open(path_out, 'w', encoding='utf8') as f:
            for i, row in enumerate(zip(*tfidf_matrix)):
                lemma_id = lemma_ids[i]
                values = ' '.join(['{:.2f}'.format(x) for x in row])
                line = '{} {}'.format(lemma_id, values)
                f.write(line + '\n')
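
The example relies on an identity_function helper that is not shown: passing it as both tokenizer and preprocessor (with token_pattern=None) is the usual way to hand TfidfVectorizer documents that are already tokenized, as get_docs presumably returns them here. A minimal sketch of what that helper presumably looks like:

def identity_function(x):
    # Pass the already tokenized document through unchanged, so that
    # TfidfVectorizer does no tokenization or preprocessing of its own.
    return x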
Example #2
def test_all_documents_contain_words(path_pp_ling_corpus: str,
                                     path_pp_token_corpus: str,
                                     path_pp_lemma_corpus: str) -> None:
    """Test if all documents in the pp-files contain words."""
    for i, doc in enumerate(
            get_docs(path_pp_ling_corpus,
                     sent_tokenized=False,
                     word_tokenized=False)):
        if len(doc) == 0:
            raise Exception('Document {} is empty.'.format(i))
    for i, doc in enumerate(
            get_docs(path_pp_token_corpus,
                     sent_tokenized=False,
                     word_tokenized=False)):
        if len(doc) == 0:
            raise Exception('Document {} is empty.'.format(i))
    for i, doc in enumerate(
            get_docs(path_pp_lemma_corpus,
                     sent_tokenized=False,
                     word_tokenized=False)):
        if len(doc) == 0:
            raise Exception('Document {} is empty.'.format(i))
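
The three loops above differ only in the corpus path. An equivalent, less repetitive form of the same test (behaviour unchanged, shown only as a possible simplification):

def test_all_documents_contain_words(path_pp_ling_corpus: str,
                                     path_pp_token_corpus: str,
                                     path_pp_lemma_corpus: str) -> None:
    """Test if all documents in the pp-files contain words."""
    for path in (path_pp_ling_corpus, path_pp_token_corpus,
                 path_pp_lemma_corpus):
        for i, doc in enumerate(get_docs(path,
                                         sent_tokenized=False,
                                         word_tokenized=False)):
            if len(doc) == 0:
                raise Exception('Document {} is empty.'.format(i))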
Example #3
    def index(
        self,
        path_in: str,
        path_out: str,
        # path_terms: str
    ) -> Tuple[Dict[str, int], Dict[int, str]]:
        """Create an index representation for the input corpus.

        Create a file where the corpus is represented with token
        indices.
        Create a file where the corpus is represented with lemma
        indices.
        Create json files to map tokens/lemmas to idxs.
        Create json files to map idxs to tokens/lemmas.
        """
        self._docs_processed = 0
        self._start_time = time.time()

        # terms = set()
        # with open(path_terms, 'r', encoding='utf8') as fin:
        #     for line in fin:
        #         terms.add(line.strip('\n'))

        word_to_idx = {}
        idx_to_word = {}
        i = 0
        corpus_idx = []
        for doc in get_docs(path_in):
            doc_idx = []
            for sent in doc:
                for word in sent:
                    if word not in word_to_idx:
                        word_to_idx[word] = i
                        idx_to_word[i] = word
                        i += 1
                idx_sent = [word_to_idx[word] for word in sent]
                doc_idx.append(idx_sent)
            corpus_idx.append(doc_idx)
            # doc_idx = []
            self._docs_processed += 1
            self._update_cmd_counter()

            if self._docs_processed % self._file_write_threshhold == 0:
                self._update_cmd_time_info()
                self.write_corpus(corpus_idx, path_out)
                corpus_idx = []

        self._update_cmd_time_info(end=True)
        self.write_corpus(corpus_idx, path_out)
        self._already_written_to_file = False
        return word_to_idx, idx_to_word
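
The docstring says the token/lemma-to-index mappings are written to json files, but index() only returns them, so that step presumably happens in the caller. A minimal sketch of what it could look like (the helper name and file names are assumptions):

import json
import os

def write_mappings(word_to_idx, idx_to_word, path_out_dir):
    # Hypothetical helper: persist the two mappings returned by index().
    with open(os.path.join(path_out_dir, 'word_to_idx.json'), 'w',
              encoding='utf8') as f:
        json.dump(word_to_idx, f)
    with open(os.path.join(path_out_dir, 'idx_to_word.json'), 'w',
              encoding='utf8') as f:
        json.dump(idx_to_word, f)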
Example #4
    def calc_tf(self, level: str) -> None:
        """Calculate the term frequency of each term for each document.

        Write one json file (tf_tokens or tf_lemmas, depending on the
        level) that maps each document index to a dictionary of the
        form {term_idx: tf}.

        Args:
            level: 't' if tokens, 'l' if lemmas.
        """
        if level == 't':
            term_idxs = self._load_term_idxs('t')
            path_in = self.path_token_idx_corpus
            path_out = self.path_tf_tokens
            # contains = self.load_contains(self.path_token_contains)
        elif level == 'l':
            term_idxs = self._load_term_idxs('l')
            path_in = self.path_lemma_idx_corpus
            path_out = self.path_tf_lemmas
            # contains = self.load_contains(self.path_lemma_contains)
        else:
            raise Exception('Error! Level not known!')

        tf = {}
        for doc_idx, doc in enumerate(get_docs(path_in)):
            tf[doc_idx] = {}
            tf_doc = tf[doc_idx]
            for sent in doc:
                for term_idx in sent:
                    if term_idx in term_idxs:
                        if term_idx in tf_doc:
                            tf_doc[term_idx] += 1
                        else:
                            tf_doc[term_idx] = 1
                        # *** Code for compound counting ***
                        # # add counts for all terms contained
                        # for tc_idx in contains[term_idx]:
                        #     if tc_idx in tf_doc:
                        #         tf_doc[str(tc_idx)] += 1
                        #     else:
                        #         tf_doc[str(tc_idx)] = 1

                        # print(doc_idx, lemma_idx)
                        # print(type(doc_idx), type(lemma_idx))
                        # tf[doc_id][lemma_idx] += 1
                        # tf_doc = tf[doc_idx]
                        # tf_doc[lemma_idx]
                        # ***
        with open(path_out, 'w', encoding='utf8') as f:
            json.dump(tf, f)

        self._docs_processed = 0
Example #5
    def calc_df(self, level: str) -> None:
        """Calculate the document frequency for all terms.

        Do plus-one smoothing to avoid zero-division errors. Write the
        output to 'frequency_analysis/df_<tokens/lemmas>.json' in the
        form of a dict: {<term_id>: [doc_id1, ...]}

        Args:
            level: 't' if tokens, 'l' if lemmas.
        """
        if level == 't':
            path_df = self.path_df_tokens
            path_idx_corpus = self.path_token_idx_corpus
            # contains = self.load_contains(self.path_token_contains)
        elif level == 'l':
            path_df = self.path_df_lemmas
            path_idx_corpus = self.path_lemma_idx_corpus
            # contains = self.load_contains(self.path_lemma_contains)
        else:
            raise Exception('Error! Level not known!')

        term_idxs = self._load_term_idxs(level)
        df = defaultdict(list)
        for i, doc in enumerate(get_docs(path_idx_corpus,
                                         sent_tokenized=False)):
            for term_idx in set(doc):
                if term_idx in term_idxs:
                    df[term_idx].append(i)

                    # *** Code for compound counting ***
                    # # add contained words to df
                    # for tc_idx in contains[term_idx]:
                    #     df[str(tc_idx)].append(i)
                    # ***
            self._docs_processed += 1
            self._update_cmd_counter()

        msg = '{} documents processed'
        print(msg.format(self._docs_processed))
        with open(path_df, 'w', encoding='utf8') as f:
            json.dump(df, f)

        self._docs_processed = 0
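
calc_df only records which documents each term occurs in; the plus-one smoothing mentioned in the docstring presumably takes effect when the document frequencies are turned into idf values downstream. A sketch of that step under the standard idf formula (the function name and the exact formula are assumptions, not part of the original code):

import json
import math

def calc_idf(path_df: str, num_docs: int) -> dict:
    # The df file maps each term id to the list of documents it occurs in,
    # so the document frequency is the length of that list. Adding one to
    # the denominator avoids division by zero for unseen terms.
    with open(path_df, 'r', encoding='utf8') as f:
        df = json.load(f)
    return {term_id: math.log(num_docs / (len(doc_ids) + 1))
            for term_id, doc_ids in df.items()}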
Example #6
    def calc_dl(self) -> None:
        """Compute the number of tokens for each document (doc length).

        Store the average length at index -1. Write output to
        'frequency_analysis/dl.json' in form of a dict: {doc_id: length}
        """
        dl = {}
        for doc in get_docs(self.path_token_idx_corpus, sent_tokenized=False):
            dl[self._docs_processed] = len(doc)
            self._docs_processed += 1
            self._update_cmd_counter()

        msg = '{} documents processed'
        print(msg.format(self._docs_processed))

        print('Calculate mean length...')
        dl[-1] = mean([dl[i] for i in dl])

        with open(self.path_dl, 'w', encoding='utf8') as f:
            json.dump(dl, f)

        self._docs_processed = 0
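
One detail to keep in mind when reading 'frequency_analysis/dl.json' back in: json object keys are always strings, so the average length stored under the integer key -1 comes back under the string key '-1':

import json

with open('frequency_analysis/dl.json', 'r', encoding='utf8') as f:
    dl = json.load(f)
avg_doc_length = dl['-1']   # average document length, stored at index -1
len_doc_0 = dl['0']         # length of document 0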
Example #7
def build_corpus_file(doc_ids: Set[int], path_base_corpus: str,
                      cur_node_id: int, path_out: str) -> str:
    """Generate corpus file from document ids.

    Args:
        doc_ids: The ids of the documents that make up the corpus.
        path_base_corpus: Path to the corpus file with all documents.
        cur_node_id: Id of the current node. Used for the name of the
            corpus file.
        path_out: Path to the output directory.
    Returns:
        The path to the generated corpus file:
        'processed_corpus/<cur_node_id>.txt'
    """
    p_out = os.path.join(path_out,
                         'processed_corpus/{}.txt'.format(cur_node_id))

    # Buffer to collect several docs before writing (fewer write operations).
    docs_str = ''
    # get_docs with word_tokenized=False yields each sentence as a string.
    with open(p_out, 'w', encoding='utf8') as f_out:
        for i, doc in enumerate(
                get_docs(path_base_corpus, word_tokenized=False)):
            if i in doc_ids:
                doc_str = ''
                for sent in doc:
                    line = sent + '\n'
                    doc_str += line
                doc_str += '\n'
                docs_str += doc_str

                if i % 1000 == 0:
                    f_out.write(docs_str)
                    docs_str = ''
        f_out.write(docs_str)

    return p_out
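
A hypothetical call (paths and ids are made up for illustration) showing how the function is used; note that the 'processed_corpus' subdirectory below path_out must already exist, since the file is opened directly for writing:

node_corpus_path = build_corpus_file(doc_ids={0, 3, 7},
                                     path_base_corpus='corpus/base_corpus.txt',
                                     cur_node_id=1,
                                     path_out='output')
# node_corpus_path == 'output/processed_corpus/1.txt'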
Example #8
def load_corpus(path):
    """Load the corpus into a dict mapping each doc index to its document."""
    corpus = {}
    for i, doc in enumerate(get_docs(path)):
        corpus[i] = doc
    return corpus
Example #9
    def __iter__(self):
        for doc in get_docs(self.path):
            for sent in doc:
                yield sent
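
The __iter__ fragment above references self.path, so the enclosing class is presumably a small streaming sentence iterator along these lines (the class name and constructor are assumptions):

class SentenceIterator:
    """Stream sentences (lists of tokens) from a corpus file, one at a time."""

    def __init__(self, path: str) -> None:
        self.path = path

    def __iter__(self):
        for doc in get_docs(self.path):
            for sent in doc:
                yield sent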
def embed_terms(path_term_to_idxs: str, path_in: str, path_out: str,
                start_num: int) -> None:
    """Create elmo embeddings for given corpus.

    Args:
        path_term_to_idxs: Path to the json file mapping terms to their indices.
        path_in: Path to input file.
        path_out: Path to output file.
        start_num: The starting doc id.

    term_embs_per_doc: {term_idx: {doc_idx: list of embeddings}}
    """
    dump_counter = 0
    print('Loading terms...')
    with open(path_term_to_idxs, 'r', encoding='utf8') as f:
        terms_to_idxs = json.load(f)

    print('Instantiating ELMo...')
    elmo = ElmoE()
    term_embs_per_doc = {}

    for i, doc in enumerate(get_docs(path_in)):
        doc_id = start_num + i
        # print(30*'-')
        print('processing {}...'.format(doc_id))
        # print('doc_id: {}, doc: {}'.format(doc_id, doc))
        for sent in doc:
            sent_terms = []
            for j in range(len(sent)):
                word = sent[j]
                if word in terms_to_idxs:
                    term_idx = terms_to_idxs[word]
                    sent_terms.append((term_idx, word.split('_'), j))
            # print('doc-id: {}, sent-terms: {}'.format(doc_id, sent_terms))
            if sent_terms:
                # prepped_sent, term_idxs = prepare_sentence(sent, sent_terms)
                # print('prepared_sent: {}, term_idxs: {}'.format(prepped_sent,
                #                                                 term_idxs))
                # print('sent:', sent)
                assert isinstance(sent, list)
                assert isinstance(sent[0], str)
                embs = elmo.get_embeddings(sent, mode=1)
                for k in range(len(sent_terms)):
                    # term_emb = get_term_emb(embs, term_idxs[h])
                    # term_emb = [float(f) for f in embs[term[1]]]
                    term_idx_in_sent = sent_terms[k][2]
                    term_emb = embs[term_idx_in_sent]
                    term_idx = sent_terms[k][0]
                    if term_idx not in term_embs_per_doc:
                        term_embs_per_doc[term_idx] = {}
                    if doc_id not in term_embs_per_doc[term_idx]:
                        term_embs_per_doc[term_idx][doc_id] = []
                    term_embs_per_doc[term_idx][doc_id].append(term_emb)

        if i % 5000 == 0:
            fpath = path_out + str(dump_counter)
            print('Write embeddings to file at: {}...'.format(fpath))
            with open(fpath, 'wb') as f:
                pickle.dump(term_embs_per_doc, f)
            print('Finished writing embeddings.')
            term_embs_per_doc = {}
            dump_counter += 1

    fpath = path_out + str(dump_counter)
    print('Write embeddings to file at: {}...'.format(fpath))
    with open(fpath, 'wb') as f:
        pickle.dump(term_embs_per_doc, f)
    print('Finished writing embeddings.')
    term_embs_per_doc = {}

    # 'output' is presumably a queue (e.g. multiprocessing.Queue) provided by
    # the caller to signal completion.
    output.put('Done')