class MySentences(object):
    """lines -> sentence list(['...', '...',...]) ->
    words list([[[' '],[' ']],[[' '],[' ']],...) ->
    phrase([['a_b','c_d']])"""
    def __init__(self, dirname, common_terms):
        self.dirname = dirname
        self.bigram = Phrases(min_count=2,
                              threshold=5,
                              common_terms=common_terms)

    def __iter__(self):
        for root, dirs, files in os.walk(self.dirname):
            for filename in files:
                file_path = os.path.join(root, filename)
                with open(file_path, 'rb') as f:
                    # read all lines in the file as a list
                    readlines = f.readlines()

                sentence_stream = [parse_sent(doc) for doc in readlines]
                self.bigram.add_vocab(sentence_stream)
                # bigram = Phrases(
                #     sentence_stream,
                #     min_count=2,
                #     threshold=5,
                #     common_terms=common_terms)
                sentence_stream = list(self.bigram[sentence_stream])
                for sent in sentence_stream:
                    yield sent
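
# A minimal usage sketch for MySentences above (not part of the original snippet), assuming
# gensim 3.x (the old `common_terms` keyword) and the project's own `parse_sent` helper.
# The corpus directory and connector-word list are hypothetical.
from gensim.models import Word2Vec

common_terms = frozenset(["of", "the", "and", "or", "with", "without"])
sentences = MySentences("corpus_dir", common_terms)  # streams bigram-phrased sentences
w2v = Word2Vec(sentences, min_count=2, workers=4)    # trains directly on the restartable iterator
w2v.save("w2v_bigrams.model")
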
def load_shit(file_paths, save_path):
    for i, path in enumerate(file_paths):
        # first iteration
        if i == 0:
            print('[info] initializing phrase model')
            with open(path) as f:
                reader = ndjson.reader(f)
                reader = extract_text(reader)
                # initialize phrase model
                phrases = Phrases(reader, delimiter=b" ")

        # every other iteration
        else:
            if i % 1000 == 0:
                progress = (i / len(file_paths)) * 100
                print('[info] processed {}% files'.format(round(progress, 1)))

            with open(path) as f:
                reader = ndjson.reader(f)
                reader = extract_text(reader)
                # show the model new data
                phrases.add_vocab(reader)

    # save model after iterations are done
    phrases.save(save_path)
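
# A brief follow-up sketch (not part of load_shit above): reload the saved Phrases model and
# freeze it into a Phraser for fast, read-only phrase detection. The path and the sample
# sentence are hypothetical; gensim 3.x API is assumed.
from gensim.models.phrases import Phrases, Phraser

reloaded = Phrases.load("phrases.model")              # model written by load_shit(...)
bigrammer = Phraser(reloaded)                         # drops raw counts, keeps detected phrases
print(bigrammer[["new", "york", "city", "weather"]])  # frequent pairs come back joined by the delimiter
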
Example 3
def trainSOPhrase(g_DataQueue, g_FinishRead, savePath, priorPhrasePath):
    """

    :param g_DataQueue:全局变量存放数据库中的数据
    :param g_FinishRead:是否读取完数据库的标志
    :param savePath:短语学习器保存的位置
    :param priorPhrasePath:前一个学习器保存的位置
    :return:
    """
    count = 0
    phrase = Phrases(None, min_count=10, threshold=15)
    if (priorPhrasePath is None):
        priorPhraser = None
    else:
        priorPhraser = Phraser(Phrases.load(priorPhrasePath))
    while (g_FinishRead.value == 0 or (not g_DataQueue.empty())):
        data = g_DataQueue.get()
        count += len(data)
        print("have processed:", count)
        words = []
        reSub0 = re.compile(
            "(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"
        )  # URLs
        reSub1 = re.compile(
            "[()\"{},:/-]|[^a-z]'|'[^a-z;?.!]|'$")  # replaced with "." below
        reSub2 = re.compile(
            "'[.?;!]")  # replaced with "." (mainly handles possessives and the tricky cases around single quotes)
        reSplit1 = re.compile(r"\.[^a-z0-9]|[?!;]")
        # extract the words
        for t in data:
            if (t[0] is not None):
                st = re.sub(reSub0, " ", t[0].lower())
                st = re.sub(reSub1, ".", st)
                st = re.sub(reSub2, ".", st)
                for sentence in re.split(reSplit1, st):
                    sen_word = sentence.split()
                    if (len(sen_word) > 6):
                        words.append(sen_word)
            if (t[1] is not None):
                st = re.sub(reSub0, " ", t[1].lower())
                st = re.sub(reSub1, ".", st)
                st = re.sub(reSub2, ".", st)
                for sentence in re.split(reSplit1, st):
                    sen_word = sentence.split()
                    if (len(sen_word) > 6):
                        words.append(sen_word)
        del data
        gc.collect()
        # learn phrases
        if (priorPhraser is None):  # first training pass
            phrase.add_vocab(words)
        else:  # already trained once; look for longer phrases
            phrase.add_vocab(priorPhraser[words])
        del words
        # print(len(phrase.vocab))
        gc.collect()
    phrase.save(savePath)
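
# A small sketch of the two-pass idea in trainSOPhrase above: a first Phrases model learns
# bigrams, then a second model is fed bigrammed sentences (the `priorPhraser[words]` pattern)
# so that longer phrases can surface. The toy corpus is made up; gensim 3.x API is assumed.
from gensim.models.phrases import Phrases, Phraser

toy_corpus = [
    ["machine", "learning", "is", "fun"],
    ["machine", "learning", "models", "need", "data"],
    ["deep", "machine", "learning", "models"],
]

first_pass = Phrases(toy_corpus, min_count=1, threshold=1)  # may learn e.g. "machine_learning"
prior_phraser = Phraser(first_pass)

second_pass = Phrases(None, min_count=1, threshold=1)
second_pass.add_vocab(prior_phraser[toy_corpus])            # same pattern as priorPhraser[words]
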
Example 4
    def update(self, new_corpus, count, wrkers, sze, wndow):
        sentences = Corpus_Sentence_Extractor(new_corpus)

        bigram = Phrases().load(self.models + 'bigram_model')
        trigram = Phrases().load(self.models + 'trigram_model')

        bigram.add_vocab(sentences)
        trigram.add_vocab(bigram[sentences])

        self.train(sentences, trigram, self.word2vec, count, wrkers, sze, wndow)
Example 5
def trainPhrase(g_DataQueue, g_FinishRead, savePath, priorPhrasePath):
    count = 0
    phrase = Phrases(None, min_count=15, threshold=10, max_vocab_size=40000000)
    if (priorPhrasePath is None):
        priorPhraser = None
    else:
        priorPhraser = Phraser(Phrases.load(priorPhrasePath))
    while (g_FinishRead.value == 0 or (not g_DataQueue.empty())):
        words = g_DataQueue.get()
        if (priorPhraser is None):  # first training pass
            phrase.add_vocab(words)
        else:  # already trained once; look for longer phrases
            phrase.add_vocab(priorPhraser[words])
        del words
        gc.collect()
    phrase.save(savePath)
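
# A minimal sketch (assumptions, not part of the snippet above) of how a producer process
# could drive trainPhrase: fill g_DataQueue with batches of tokenised sentences, then flip
# g_FinishRead once reading is done. `read_batches` is a hypothetical generator.
import multiprocessing as mp

if __name__ == "__main__":
    g_DataQueue = mp.Queue()
    g_FinishRead = mp.Value("i", 0)

    worker = mp.Process(target=trainPhrase,
                        args=(g_DataQueue, g_FinishRead, "phrase.model", None))
    worker.start()

    for batch in read_batches():       # hypothetical: yields [[token, ...], ...] batches
        g_DataQueue.put(batch)
    g_FinishRead.value = 1             # signal the worker that no more data is coming

    worker.join()
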
Example 6
def salient_bigrams(phrases: Phrases):
    """Finds the most salient bigrams

    Args:
        phrases (Phrases): Phrases class set up for bigram search
    """
    for slice in read_corpus():
        phrases.add_vocab(read_slice(slice))

        # evaluate all previous corpus slices
        found = set()
        total_bigrams_encountered = 0
        for previous_slice in read_corpus():
            for phrase, score in phrases.export_phrases(
                    read_slice(previous_slice)):
                found.add((phrase, score))
                total_bigrams_encountered += 1
            if previous_slice == slice:
                break

        found = sorted(found, key=lambda element: element[1], reverse=True)

        # no bigrams found?
        if len(found) == 0:
            output(slice, "")

        # log the top ten bigrams
        for phrase, score in found[:10]:
            output(slice, "{phrase}, {score}".format(phrase=phrase,
                                                     score=score))

        # log the total counts
        output(
            slice, """
Total bigrams: {total}
Unique bigrams: {unique}
Median score:{median}
Max score:{max}
Min score:{min}
""".format(total=total_bigrams_encountered,
           unique=len(found),
           median=found[len(found) // 2] if len(found) != 0 else 0,
           max=found[0] if len(found) != 0 else 0,
           min=found[-1]) if len(found) != 0 else 0)

        # will log a time if command line args were enabled
        Timer.try_to_time()
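
# A toy sketch of the export_phrases call used above, showing what the (phrase, score) pairs
# look like. In gensim 3.x (as in these snippets) each phrase comes back as bytes joined by
# the output delimiter; the three-sentence corpus is made up.
from gensim.models.phrases import Phrases

toy = [["new", "york", "taxi"], ["new", "york", "subway"], ["los", "angeles", "taxi"]]
toy_model = Phrases(toy, min_count=1, threshold=0.1)

for phrase, score in toy_model.export_phrases(toy):
    print(phrase, score)  # e.g. b'new york' with its collocation score
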
Example 7
    else:
        nlp = None

    # TODO:
    # 1. Create n-gram features for full text similarity search
    #   This will create a problem of two documents being similar in
    #   everything but the subject of research. We want documents
    #   to be similar in the subject of research.
    # 2. Need to reliably determine the topic. Seems hard to solve
    #   this in ad-hoc fashion. Need to train LDA on paper abstracts
    #   first.

    if nlp is not None:
        tokenized = nlp(full_text)

        phrases_model.add_vocab([token for token, pos in s] for s in tokenized)
        phrase_voc.add_documents(nlp.chunks(t) for t in tokenized)

    print("\n")

#%%

pickle.dump(phrases_model, open("gensim_phrase.pkl", "wb"))
pickle.dump(phrase_voc, open("gensim_chunk_dict.pkl", "wb"))

#%%

common_dict = Dictionary()

for loc, filecontent in get_files_content(files):
Example 8
seg = Segmenter()
# vocab = Dictionary()
phrases = Phrases()

text_path = sys.argv[1]

def get_data(text_path):

    for line in open(text_path, "r"):
        line = line.strip()

        if line:
            data = json.loads(line)

            yield data['abstract']

for ind, text in enumerate(get_data(text_path)):
    segments = seg(text, segment_len=1, segment_overlap=0)

    phrases.add_vocab(segments)
    # vocab.add_documents(segments, prune_at=2000000)

    if ind % 10000 == 0:
        print(f"\rProcessed: {ind}", end="")

# vocab.filter_extremes(no_below=5, no_above=0.5, keep_n=2000000)
# vocab.save("academic.dict")

phrases.save("academic.phrases")
Example 9
def salient_trigrams(phrases: Phrases):
    """Finds the most salient trigrams

    Args:
        phrases (Phrases): Phrases model used for the first-pass bigram detection
    """
    trigram = Phrases()

    for slice in read_corpus():
        # prepare the bigram
        for previous_slice in read_corpus():
            phrases.add_vocab(read_slice(previous_slice))
            if previous_slice == slice:
                break

        # transform sentences into possible bigrams
        bigram_phraser = Phraser(phrases)

        def bigrammed(slice: str):
            for sent in read_slice(slice):
                yield bigram_phraser[sent]

        trigram.add_vocab(bigrammed(slice))

        # evaluate all previous corpus slices
        found = set()
        total_trigrams_encountered = 0
        for previous_slice in read_corpus():
            for phrase, score in trigram.export_phrases(
                    bigrammed(previous_slice)):
                if phrase.count(b'_') == 2:
                    found.add((phrase, score))
                    total_trigrams_encountered += 1
                elif b'_' in phrase:
                    print(phrase)
            if previous_slice == slice:
                break

        found = sorted(found, key=lambda element: element[1], reverse=True)

        # no trigrams found?
        if len(found) == 0:
            output(slice, "")

        # log the top ten trigrams
        for phrase, score in found[:10]:
            output(slice, "{phrase}, {score}".format(phrase=phrase,
                                                     score=score))

        # log the total counts
        output(
            slice, """
Total trigrams: {total}
Unique trigrams: {unique}
Median score:{median}
Max score:{max}
Min score:{min}
""".format(total=total_trigrams_encountered,
           unique=len(found),
           median=found[len(found) // 2] if len(found) != 0 else 0,
           max=found[0] if len(found) != 0 else 0,
           min=found[-1] if len(found) != 0 else 0))

        # will log a time if command line args were enabled
        Timer.try_to_time()
Example 10
class GensimTokenizer:
    def __init__(self, dictionary, phraser=None, patch_dict=PATCH_DICT):
        """Wrap a Gensim Dictionary, phrase detector, and special tokens for creating tokenization from OWT

        Args:
            dictionary: The gensim dictionary mapping vocabulary to IDs and back
            phraser: If provided, use gensim's phrase detector to lump common concepts together
            patch_dict: Patch the dictionary with special tokens
        """
        self.dictionary = dictionary
        self.phraser = Phrases([[]]) if phraser is None else phraser
        self.patch_dict = patch_dict

    @classmethod
    def from_file(cls, dict_fname, phraser_fname=None):
        """Load tokenizer information from a dictionary file (generated by gensim dictionary.save) and a phraser file."""
        d = Dictionary.load(str(dict_fname))
        if phraser_fname is not None:
            p = Phraser.load(phraser_fname)
        else:
            p = Phraser(Phrases([[]]))

        return cls(d, p)

    def add_document_from_fname(self, fname):
        """For training, add the contents of a text file to the dictionary"""
        print(f"Adding {fname}")
        tokens = self.phraser[file2tokens(fname)]
        self.dictionary.add_documents(tokens)

    def add_to_phraser_from_fname(self, fname):
        """Detect common phrases from fname for bigramming purposes"""
        print(f"Adding {fname} to phraser")
        tokens = file2tokens(fname)
        self.phraser.add_vocab(tokens)

    def get_dictionary(self):
        return self.dictionary

    def token2id(self, word):
        """Convert a token into an id, converting to UNK ID as necessary"""
        d = self.dictionary
        return d.token2id.get(word, d.token2id["<UNK>"])

    def tokens2ids(self, tokens):
        """Convert a list of tokens into ids, converting to UNK as necessary"""
        return [self.token2id(tok) for tok in tokens]

    def tokenize(self, s: str):
        """Convert a sentence into its tokens"""
        return self.phraser[process_line(s)[0]]

    def tokenize_batch(self, lines: List[str]):
        """Convert a batch of lines into their tokens"""
        return self.phraser[[process_line(line)[0] for line in lines]]

    def encode(self, s):
        """Encode a single sentence into IDs"""
        sent_tokens = self.tokenize(s)
        return self.tokens2ids(sent_tokens)

    def decode(self, ids):
        """Alias for `ids2tokens`"""
        return self.ids2tokens(ids)

    def id2token(self, id):
        """Convert an id to a token"""
        d = self.dictionary
        if id == -1: return "<STOPWRD>"  # Account for post processing
        return d[id]  # Add error handling if bad id

    def ids2tokens(self, ids):
        """Convert iterable of ids to tokens"""
        return [self.id2token(id) for id in ids]

    def set_outdir(self, outdir):
        """Useful when training in parallel. If set, will save contents to outdir"""
        self.outdir = Path(outdir)

    def patch(self, vocab_size, new_vocab, no_below=15, no_above=0.8):
        """Patch the tokenizer with a manually specified list of tokens, after training"""

        print("Patching with special tokens...")
        self.dictionary.patch_with_special_tokens(self.patch_dict)
        print("Filtering vocabulary...")
        self.dictionary.filter_extremes(no_below=no_below,
                                        no_above=no_above,
                                        keep_n=vocab_size)

        print(f"Adding {len(new_vocab)} new words to dictionary...")
        new_vocab = self.tokenize_batch(new_vocab)
        self.dictionary.add_documents(new_vocab)
        print(f"Done patching. New vocab size = {self.n_vocab()}")
        return new_vocab

    def save(self, outfile):
        self.dictionary.save(outfile)

    def n_vocab(self):
        return len(self.vocab)

    @cached_property
    def vocab(self):
        return list(self.dictionary.keys())

    @cached_property
    def token_vocab(self):
        return list(self.dictionary.values())

    def __len__(self):
        return self.n_vocab()

    def encode_sentences_from_fname(self, fname):
        """Tokenize all the sentences from a text file"""
        outlist = []
        ind_offsets = []
        new_start = 0

        with open(fname, 'r') as fp:
            for line in fp.readlines():
                if is_good_line(line):
                    sents = self.phraser[process_line(line)]
                    for sent in sents:
                        ids = self.tokens2ids(sent)
                        outlist += ids
                        new_start = new_start + len(ids)
                        ind_offsets.append(new_start)

        return np.asarray(outlist, dtype=np.int32), np.asarray(ind_offsets,
                                                               dtype=np.uint64)

    def encode_and_save_for_mp(self, fname):
        """Save sentences from fname. Needed because a local function can't be used with the MP module"""
        if getattr(self, "outdir", None) is None: raise ValueError("Please `set_outdir` first")

        fname = Path(fname)

        idarr_outfile = self.outdir / (fname.stem + '.npy')
        ind_offsets_outfile = self.outdir / (fname.stem + '_offsets.npy')
        idarr, ind_offsets = self.encode_sentences_from_fname(fname)
        np.save(idarr_outfile, idarr)
        np.save(ind_offsets_outfile, ind_offsets)
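
# A minimal end-to-end sketch for GensimTokenizer above, using a toy corpus. It assumes the
# module-level PATCH_DICT plus gensim's Dictionary/Phrases; adding "<UNK>" as a plain token
# here is only a stand-in for the real patch()/patch_with_special_tokens flow.
from gensim.corpora import Dictionary
from gensim.models.phrases import Phrases, Phraser

toy_docs = [["deep", "learning", "models"], ["deep", "learning", "is", "useful"]]

d = Dictionary(toy_docs)
d.add_documents([["<UNK>"]])  # token2id() falls back to "<UNK>" for out-of-vocabulary words
p = Phraser(Phrases(toy_docs, min_count=1, threshold=1))

tok = GensimTokenizer(d, p)
ids = tok.tokens2ids(["deep", "learning", "unknown_word"])
print(ids, tok.ids2tokens(ids))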