Example #1
def get_word2vec_and_vocab_part_2():
    allennlp_reader = TextCatReader(
        word_tokenizer=WordTokenizer(word_filter=PassThroughWordFilter()),
        segment_sentences=True)
    label_iterator = LabelIterator(allennlp_reader,
                                   filepaths_of_data_to_train_on)
    make_label_ind_files(label_iterator)
Example #2
 def __init__(
     self,
     model_folder_name: str,
     token_indexers: Dict[str, TokenIndexer] = None,
     word_tokenizer: Tokenizer = None,
     segment_sentences: bool = False,
     lazy: bool = False,
     column_titles_to_index: List[str] = ("tokens", )
 ) -> None:
     super().__init__(lazy=lazy)
     if model_folder_name.endswith('/'):
         model_folder_name = model_folder_name[:-1]
     if '/' in model_folder_name:
         model_folder_name = model_folder_name[model_folder_name.
                                               rfind('/') + 1:]
     self.model_folder_name = model_folder_name
     self._word_tokenizer = word_tokenizer or WordTokenizer(
         word_filter=PassThroughWordFilter())
     self._segment_sentences = segment_sentences
     self._token_indexers = token_indexers or {
         'tokens': SingleIdTokenIndexer()
     }
     self._column_titles_to_index = column_titles_to_index
     assert len(self._column_titles_to_index) > 0
     if self._segment_sentences:
         self._sentence_segmenter = SentenceTokenizer()
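The two string operations at the top of this constructor just reduce model_folder_name to its final path component (drop one trailing slash, then keep everything after the last '/'). As a side note, a minimal sketch of the same normalization via the standard library (an alternative route, not what this example uses; behavior matches for paths with at most one trailing slash):

import os

def last_path_component(model_folder_name: str) -> str:
    # Drop trailing slashes, then keep only the final path component.
    return os.path.basename(model_folder_name.rstrip('/'))

assert last_path_component("runs/my_model/") == "my_model"
assert last_path_component("my_model") == "my_model"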
Example #3
 def __init__(
     self,
     word_splitter: WordSplitter = SimpleWordSplitter(),
     word_filter: WordFilter = PassThroughWordFilter(),
     word_stemmer: WordStemmer = PassThroughWordStemmer()
 ) -> None:
     self.word_splitter = word_splitter
     self.word_filter = word_filter
     self.word_stemmer = word_stemmer
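The constructor above only wires together the three pluggable stages of a word tokenizer: a splitter, a filter (here the pass-through filter, which keeps every word), and a stemmer. As a rough illustration of how such stages are typically chained, a minimal sketch (the tokenize method and the stage method names below are assumptions for illustration, not code from this listing):

from typing import List

class MinimalWordTokenizer:
    # Hypothetical pipeline: split the text, filter the words, stem what remains.
    def __init__(self, splitter, word_filter, stemmer):
        self.splitter = splitter
        self.word_filter = word_filter
        self.stemmer = stemmer

    def tokenize(self, text: str) -> List[str]:
        words = self.splitter.split_words(text)
        words = self.word_filter.filter_words(words)
        return [self.stemmer.stem_word(word) for word in words]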
Example #4
 def __init__(self,
              word_splitter: WordSplitter = None,
              word_filter: WordFilter = PassThroughWordFilter(),
              word_stemmer: WordStemmer = PassThroughWordStemmer(),
              start_tokens: List[str] = None,
              end_tokens: List[str] = None) -> None:
     self._word_splitter = word_splitter or JiebaSplitter()
     self._word_filter = word_filter
     self._word_stemmer = word_stemmer
     self._start_tokens = start_tokens or []
     self._start_tokens.reverse()
     self._end_tokens = end_tokens or []
Example #5
 def __init__(self,
              word_splitter: WordSplitter = None,
              word_filter: WordFilter = PassThroughWordFilter(),
              word_stemmer: WordStemmer = PassThroughWordStemmer(),
              start_tokens: List[str] = None,
              end_tokens: List[str] = None) -> None:
     self._word_splitter = word_splitter or SpacyWordSplitter()
     self._word_filter = word_filter
     self._word_stemmer = word_stemmer
     self._start_tokens = start_tokens or []
     # We reverse the tokens here because we're going to insert them with `insert(0)` later;
     # this makes sure they show up in the right order.
     self._start_tokens.reverse()
     self._end_tokens = end_tokens or []
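The comment in the example above explains the reversal: start tokens are later prepended one at a time with insert(0), so storing them reversed makes them end up in their original order. A small self-contained sketch of that pattern (the add_start_tokens helper is illustrative only, not part of the example):

def add_start_tokens(tokens, reversed_start_tokens):
    # reversed_start_tokens is assumed to already be reversed, as in the
    # constructor above; repeated insert(0) then restores the original order.
    tokens = list(tokens)
    for start_token in reversed_start_tokens:
        tokens.insert(0, start_token)
    return tokens

print(add_start_tokens(["hello", "world"], ["<s2>", "<s1>"]))
# ['<s1>', '<s2>', 'hello', 'world']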
Example #6
 def __init__(self,
              token_indexers: Dict[str, TokenIndexer] = None,
              word_tokenizer: Tokenizer = None,
              segment_sentences: bool = False,
              lazy: bool = False,
              column_titles_to_index: List[str] = ("tokens", )) -> None:
     super().__init__(lazy=lazy)
     self._word_tokenizer = word_tokenizer or WordTokenizer(word_filter=PassThroughWordFilter())
     self._segment_sentences = segment_sentences
     self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
     self._column_titles_to_index = column_titles_to_index
     assert len(self._column_titles_to_index) > 0
     if self._segment_sentences:
          self._sentence_segmenter = SentenceTokenizer()
Example #7
def get_word2vec_and_vocab_part_1():
    first_data_filepath = filepaths_of_data_to_train_on[0]
    allennlp_reader = TextCatReader(
        word_tokenizer=WordTokenizer(word_filter=PassThroughWordFilter()),
        segment_sentences=True)
    sentence_iterator = GensimSentenceIterator(allennlp_reader,
                                               filepaths_of_data_to_train_on)
    words_to_inds, num_words_in_vocab_including_unk, total_num_sentences, vocab_filename_nonums = \
        get_vocab_and_collect_num_sentences(sentence_iterator, first_data_filepath)
    sentence_iterator.valid_vocab_words = words_to_inds
    if write_temp_file_of_all_provided_data_with_set_vocab:
        sentence_iterator, temp_data_filename = make_temp_file_and_iterate_over_that_instead(
            sentence_iterator)
        words_to_inds = None

    label_iterator = LabelIterator(allennlp_reader,
                                   filepaths_of_data_to_train_on)
    make_label_ind_files(label_iterator)
Example #8
 def __init__(self,
              word_splitter: WordSplitter = None,
              word_filter: WordFilter = PassThroughWordFilter(),
              word_stemmer: WordStemmer = PassThroughWordStemmer(),
              start_tokens: List[str] = None,
              end_tokens: List[str] = None,
              in_between_tokens: List[str] = None):
     self.tokenizer = WordTokenizer(word_splitter=word_splitter,
                                    word_filter=word_filter,
                                    word_stemmer=word_stemmer)
     self.start_tokens = start_tokens or []
     self.start_tokens = [Token(token) for token in self.start_tokens]
     self.end_tokens = end_tokens or []
     self.end_tokens = [Token(token) for token in self.end_tokens]
     self.in_between_tokens = in_between_tokens or []
     self.in_between_tokens = [
         Token(token) for token in self.in_between_tokens
     ]
Example #9
def get_word2vec_and_vocab_part_3():
    first_data_filepath = filepaths_of_data_to_train_on[0]
    vocab_filename_nonums = first_data_filepath[:first_data_filepath.rfind('.')] + \
                            vocabword_ind_not_numbered_file_ending
    num_words_in_vocab_including_unk = 0
    with open(vocab_filename_nonums, 'r') as f:
        for line in f:
            if line.strip() == '':
                continue
            num_words_in_vocab_including_unk += 1
    print(
        str(num_words_in_vocab_including_unk) +
        " words in vocab, including unk token.")
    if write_temp_file_of_all_provided_data_with_set_vocab:
        temp_data_filename = first_data_filepath + ".temp"
        sentence_iterator = SimpleIterator([temp_data_filename])
        total_num_sentences = -1
        with open(temp_data_filename, 'r') as f:
            for line in f:
                if line.strip() == '':
                    continue
                total_num_sentences += 1
    else:
        allennlp_reader = TextCatReader(
            word_tokenizer=WordTokenizer(word_filter=PassThroughWordFilter()),
            segment_sentences=True)
        sentence_iterator = GensimSentenceIterator(
            allennlp_reader, filepaths_of_data_to_train_on)
        total_num_sentences = 0
        for sentence in iter(sentence_iterator):
            total_num_sentences += 1
    sentence_iterator.total_num_sentences = total_num_sentences

    print("Starting to train model.")

    trained_model = Word2Vec(None,
                             iter=iterations_for_training_word_embeddings,
                             min_count=0,
                             size=word_embedding_dimension,
                             workers=4)
    trained_model.build_vocab(sentence_iterator)
    trained_model.train(sentence_iterator,
                        total_examples=total_num_sentences,
                        epochs=iterations_for_training_word_embeddings)
    temp_filename = (first_data_filepath[:first_data_filepath.rfind('.')] +
                     "_tempgensim")
    trained_model.save(temp_filename)

    print("Starting to move trained embeddings into numpy matrix at " +
          str(datetime.datetime.now()))
    np_embedding_filename = (
        first_data_filepath[:first_data_filepath.rfind('.')] +
        embedding_file_tag + ".npy")
    num_vocab_words = num_words_in_vocab_including_unk - 1
    embedding_matrix = np.zeros(
        (num_vocab_words + 2, word_embedding_dimension))
    ind_counter = 1
    with open(vocab_filename_nonums, 'r') as f:
        for line in f:
            if line.strip() == '':
                continue
            line = line[:-1]  # get rid of newline
            embedding_matrix[ind_counter] = trained_model[line]
            ind_counter += 1
    embedding_matrix[num_words_in_vocab_including_unk] = trained_model[
        unk_token]
    norm_of_embeddings = np.linalg.norm(embedding_matrix, axis=1)
    norm_of_embeddings[norm_of_embeddings == 0] = 1e-13
    embedding_matrix = embedding_matrix / norm_of_embeddings[:, None]
    np.save(np_embedding_filename, embedding_matrix)

    print(
        "Starting to save numpy matrix as hdf5 file containing torch tensor at "
        + str(datetime.datetime.now()))
    hdf5_filename = (first_data_filepath[:first_data_filepath.rfind('.')] +
                     embedding_file_tag + ".h5")
    with h5py.File(hdf5_filename, "w") as f:
        dset = f.create_dataset(
            "embedding",
            (num_words_in_vocab_including_unk + 1, word_embedding_dimension),
            dtype='f')
        dset[...] = embedding_matrix

    print("Removing temporary gensim model files at " +
          str(datetime.datetime.now()))
    # remove temp gensim model files, now that embedding matrix has been saved
    if os.path.isfile(temp_filename):
        os.remove(temp_filename)
    other_files_to_rm = glob(temp_filename + ".*")
    for fname in other_files_to_rm:
        os.remove(fname)
    if write_temp_file_of_all_provided_data_with_set_vocab:
        if os.path.isfile(temp_data_filename):
            os.remove(temp_data_filename)
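The function above saves the normalized embedding matrix twice: as a .npy file and as an HDF5 file whose single dataset is named "embedding". A minimal sketch of reading the HDF5 file back (the filename below is a placeholder; only the h5py and numpy calls already used above are assumed):

import h5py
import numpy as np

hdf5_filename = "my_data_embeddings.h5"  # placeholder for <data prefix> + embedding_file_tag + ".h5"
with h5py.File(hdf5_filename, "r") as f:
    embedding_matrix = np.array(f["embedding"])
# Row 0 was never written by the loop above and stays all zeros; the
# remaining rows hold the (normalized) word vectors.
print(embedding_matrix.shape)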
def add_info_about_num_sents_and_max_num_tokens_to_data_file(data_filename):
    print("Starting to calculate information about number of sentences and max number of tokens for each " +
          "instance to add to " + data_filename)
    numsents_maxnumtokens = []
    numsents_maxnumtokens_if_not_considering_stop_words = []
    allennlp_formatted_reader = TextCatReader(word_tokenizer=WordTokenizer(word_filter=PassThroughWordFilter()),
                                              segment_sentences=True)
    pass_through_word_filter = PassThroughWordFilter()
    stop_word_filter = StopwordFilter()

    first_line = True
    num_instances_passed = 0
    with open(data_filename, 'r') as f:
        for line in tqdm(f):
            if first_line:
                # find which field is tokens
                temp_line = line
                tokens_field_ind = 0
                while not (temp_line.startswith('tokens\t') or temp_line.startswith('tokens\n')):
                    temp_line = temp_line[temp_line.index('\t') + 1:]
                    tokens_field_ind += 1
                first_line = False
            else:
                if line.strip() == '':
                    continue
                text_field = get_nth_field_in_line(line, tokens_field_ind)

                num_instances_passed += 1
                if len(text_field) == 0:
                    # reader will skip this, so its tuple is (0, 0)
                    numsents_maxnumtokens.append((0, 0))
                    numsents_maxnumtokens_if_not_considering_stop_words.append((0, 0))
                    continue

                allennlp_formatted_reader._word_tokenizer._word_filter = pass_through_word_filter
                instance = allennlp_formatted_reader.text_to_instance(tokens=text_field, category='placeholder')
                if instance is None:
                    # reader will skip this, so its tuple is (0, 0)
                    # and since the stop word version is even MORE restrictive, it would be skipped there too
                    numsents_maxnumtokens.append((0, 0))
                    numsents_maxnumtokens_if_not_considering_stop_words.append((0, 0))
                    continue
                list_of_sentences = instance.fields['tokens'].field_list
                num_sents = len(list_of_sentences)
                max_num_tokens = 0
                for sentence_as_text_field in list_of_sentences:
                    list_of_tokens = sentence_as_text_field.tokens
                    if len(list_of_tokens) > max_num_tokens:
                        max_num_tokens = len(list_of_tokens)
                numsents_maxnumtokens.append((num_sents, max_num_tokens))

                allennlp_formatted_reader._word_tokenizer._word_filter = stop_word_filter
                instance = allennlp_formatted_reader.text_to_instance(tokens=text_field, category='placeholder')
                if instance is None:
                    # reader will skip this, so its tuple is (0, 0)
                    numsents_maxnumtokens_if_not_considering_stop_words.append((0, 0))
                    continue
                list_of_sentences = instance.fields['tokens'].field_list
                num_sents = len(list_of_sentences)
                max_num_tokens = 0
                for sentence_as_text_field in list_of_sentences:
                    list_of_tokens = sentence_as_text_field.tokens
                    if len(list_of_tokens) > max_num_tokens:
                        max_num_tokens = len(list_of_tokens)
                numsents_maxnumtokens_if_not_considering_stop_words.append((num_sents, max_num_tokens))

    assert len(numsents_maxnumtokens) == num_instances_passed
    assert len(numsents_maxnumtokens_if_not_considering_stop_words) == num_instances_passed

    temp_full_filename = data_filename[:data_filename.rfind('.')] + "_temp.tsv"
    old_f = open(data_filename, 'r')
    first_line = True
    instance_counter = 0
    with open(temp_full_filename, 'w') as f:
        for line in old_f:
            if first_line:
                old_line = line[:line.rfind('\n')]

                new_line = (old_line + '\t' +
                            'num_sentences_post_stopword_removal' + '\t' +
                            'max_num_tokens_in_sentence_post_stopword_removal' + '\t' +
                            'num_sentences' + '\t' +
                            'max_num_tokens_in_sentence' + '\n')
                f.write(new_line)
                first_line = False
            else:
                if line.strip() == '':
                    continue
                str_num_sents_no_stopwords = str(
                    numsents_maxnumtokens_if_not_considering_stop_words[instance_counter][0])
                str_max_num_tokens_no_stopwords = str(
                    numsents_maxnumtokens_if_not_considering_stop_words[instance_counter][1])
                str_num_sents = str(numsents_maxnumtokens[instance_counter][0])
                str_max_num_tokens = str(numsents_maxnumtokens[instance_counter][1])
                old_line = line[:line.rfind('\n')]

                new_line = (old_line + '\t' +
                            str_num_sents_no_stopwords + '\t' +
                            str_max_num_tokens_no_stopwords + '\t' +
                            str_num_sents + '\t' +
                            str_max_num_tokens + '\n')

                f.write(new_line)
                instance_counter += 1
    old_f.close()

    if os.path.isfile(data_filename):
        os.remove(data_filename)
    os.rename(temp_full_filename, data_filename)
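The function above relies on a get_nth_field_in_line helper that is not shown in this listing. Given the tab-separated format being read, a minimal sketch of what it presumably does (an assumption, not the project's actual implementation):

def get_nth_field_in_line(line: str, n: int) -> str:
    # Split a tab-separated line and return the nth (0-indexed) field,
    # without its trailing newline.
    return line.rstrip('\n').split('\t')[n]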
class InstanceLenGenerator:
    def __init__(self, allennlp_formatted_reader, filepaths):
        self.allennlp_formatted_reader = allennlp_formatted_reader
        self.filepaths = filepaths

    def __iter__(self):
        for filepath in self.filepaths:
            for instance in tqdm(
                    self.allennlp_formatted_reader._read(file_path=filepath)):
                instance_as_text_field = instance.fields['tokens']
                yield len(instance_as_text_field.tokens)


allennlp_reader = TextCatReader(
    word_tokenizer=WordTokenizer(word_filter=PassThroughWordFilter()),
    segment_sentences=False)
len_generator = InstanceLenGenerator(allennlp_reader, filenames_to_use)
all_lengths = []
for length in iter(len_generator):
    all_lengths.append(length)

arr_of_lengths = np.array(all_lengths)
m = np.mean(arr_of_lengths)
sd = np.std(arr_of_lengths)
with open(write_to_filename, 'w') as f:
    f.write('Mean: ' + str(m) + '\n')
    f.write('SD:   ' + str(sd) + '\n')

print("Done calculating word stats.")