def merge_categories_to_whole_set(path):
    count = 1
    name_list = get_namelist(path)
    init_category = 'null'
    list_tmp_to_store_sample = []
    output = []
    for i, line in enumerate(name_list):
        classification = classification_extract(line[0])
        file = line[1]
        if classification != init_category:
            # repeat the category's tokens four times before tagging the finished category
            list_tmp_to_store_sample_tmp = list_tmp_to_store_sample * 4
            if count > 1:
                output.append(d2v.TaggedDocument(list_tmp_to_store_sample_tmp, [init_category]))
            count += 1
            init_category = classification
            list_tmp_to_store_sample = []
        if classification == init_category:
            with open(file, 'r', encoding="utf-8") as f:
                try:
                    contents = f.readline()
                    # For training data, add tags
                    line_split = contents.split()
                    if len(line_split) > 1:
                        list_tmp_to_store_sample.extend(line_split)
                except Exception as e:
                    print(e)
    # tag the final category after the loop ends
    list_tmp_to_store_sample_tmp = list_tmp_to_store_sample * 4
    output.append(d2v.TaggedDocument(list_tmp_to_store_sample_tmp, [init_category]))
    return output
Example #2
 def __iter__(self):
     document_id = 0
     for file_ind, file_name in enumerate(self.files_list):
         try:
             fasta_sequences = SeqIO.parse(
                 _open(os.path.join(self.input_folder, file_name)), 'fasta')
             seq_id = 0
             for fasta in fasta_sequences:
                 seq_id += 1
                 name, sequence = fasta.id, str(fasta.seq)
                 documents_list = self._get_document_from_fasta(
                     sequence, self.processing_mode, self.k,
                     self.shift_size)
                 for doc_ind, doc in enumerate(documents_list):
                     yield doc2vec.TaggedDocument(doc, [document_id])
                 # Use same document_id for all sequences if non-overlapping
                 document_id += 1
             if file_ind % 1 == 0:
                 print(
                     f"Finished processing file #{file_ind}, file_name:{file_name}, number of genes: {seq_id} document_id: {document_id}"
                 )
         except Exception as e:
             print(
                 f"****ERROR IN PARSING file: {file_name}, seq_id: {seq_id},"
             )
             print(f"name: {name}  sequence: {sequence}")
             print(f"Error message: {e}")
Example #3
def tokenize(df, col, tokens_only=False):
	"""
	Given a DataFrame and a column, tokenizes the words in that column

	Parameters
	----------
	df: DataFrame
		dataframe with column to be tokenized
	col: str
		column name of text to be tokenized
	tokens_only: bool
		to train the doc2vec model, we’ll need to 
		associate a tag/number with each document of the training corpus. 
		tokens_only=True means don't associate anything
	Returns
	----------
	list
		tokenized words
	"""
	tokens = df[col].apply(lambda x: simple_preprocess(x, deacc=True, max_len=20))  # max_len=20 in case important words are longer than the default 15 chars
	if tokens_only:
		return tokens
	else:
		# For training data, add tags -- notice it is just an index number
		return [doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(tokens)]
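Usage sketch (not from the original source): a tiny DataFrame fed through tokenize and into Doc2Vec; the column name "text" and the hyperparameters are placeholders.

import pandas as pd
from gensim.models import doc2vec
from gensim.utils import simple_preprocess

df = pd.DataFrame({"text": ["the quick brown fox", "jumps over the lazy dog"]})
train_corpus = tokenize(df, "text")          # list of TaggedDocument objects
model = doc2vec.Doc2Vec(vector_size=50, min_count=1, epochs=20)
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)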
Example #4
def preprocess_lines(pdf_content, document_tag):
    """Preprocesses a string of words. For the moment, the step are:
        - removes everything but letters
        - splits into lines
        - converts to lower case
        - splits line into words
        - takes stems of words
        - takes out one and two caracter words
        - takes out stop words (provided by nltk)
    """

    # keep only letters and newlines; numbers and punctuation become spaces
    letters_only = re.sub("[^a-zA-Z\n]", " ", pdf_content)
    line_list = letters_only.split('\n')

    sentence_list = []
    stem_dict = {}
    for line in line_list:
        words = line.lower().split()
        # stem words
        meaningful_words, stem_dict_line = stem_words(words)
        stem_dict.update(stem_dict_line)
        # take out one and two character words
        meaningful_words = [w for w in meaningful_words if len(w) > 2]
        # take out stop words
        meaningful_words = [w for w in meaningful_words if w not in stops]

        if len(meaningful_words) != 0:
            sentence_list.extend(meaningful_words)

    if sentence_list != []:
        tagged_document = doc2vec.TaggedDocument(sentence_list, tags=[document_tag])
        return tagged_document, stem_dict
    else:
        return None, stem_dict
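The snippet above relies on a module-level stops set and a stem_words helper that are not shown. A minimal sketch of what they might look like, using NLTK (an assumption, not the original project's code):

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords', quiet=True)
stops = set(stopwords.words('english'))
_stemmer = PorterStemmer()

def stem_words(words):
    # return the stemmed words plus a stem -> original-word mapping,
    # matching how preprocess_lines() consumes the two return values
    stemmed, stem_dict_line = [], {}
    for w in words:
        s = _stemmer.stem(w)
        stemmed.append(s)
        stem_dict_line[s] = w
    return stemmed, stem_dict_line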
 def __iter__(self):
     document_id = 0
     for file_ind, file_name in enumerate(self.files_list):
         try:
             fasta_sequences = SeqIO.parse(
                 _open(os.path.join(self.input_folder, file_name)), 'fasta')
             seq_id = 0
             for fasta in fasta_sequences:
                 x = random.random()
                 # if x <= 0.5:
                 #     continue
                 seq_id += 1
                 name, sequence = fasta.id, str(fasta.seq)
                 documents_list = self._get_document_from_fasta(sequence)
                 for doc_ind, doc in enumerate(documents_list):
                     yield doc2vec.TaggedDocument(doc, [document_id])
             if file_ind % 1 == 0:
                 print(
                     f"Finished processing file #{file_ind}, file_name:{file_name.replace('.fna.gz', '')}, number of genes: {seq_id} document_id: {document_id}"
                 )
         except Exception as e:
             print(
                 f"****ERROR IN PARSING file: {file_name}, seq_id: {seq_id},"
             )
             print(f"name: {name}  sequence: {sequence}")
             print(f"Error message: {e}")
         document_id += 1
Example #6
def train_doc2_vec(
    data_path,
    embed_size=64,
    epoch=10,
    min_count=1,
    window=4,
    workers=8,
    model_path="word2vec_entities_raw_agg_user.model",
):

    df_all = pd.read_csv(data_path)
    df_all.fillna("null", inplace=True)
    logging.info(f"Load All Data: {df_all.shape}")
    logging.info(f"Build Training Corpus for Doc2Vec")
    start = time.time()
    ex_entities = df_all["ex_entities"].tolist()
    train_corpus = [
        doc.TaggedDocument(word, [idx]) for idx, word in enumerate(ex_entities)
    ]
    logging.info(f"Train Doc2Vec model")
    model = doc.Doc2Vec(
        vector_size=embed_size,
        min_count=min_count,
        epochs=epoch,
        window=window,
        workers=workers,
    )
    model.build_vocab(train_corpus)

    model.train(train_corpus,
                total_examples=model.corpus_count,
                epochs=model.epochs)

    logging.info("finished ({:.2f} sec elapsed)".format(time.time() - start))
    model.save(model_path)
Example #7
 def test_mixed_tag_types(self):
     """Ensure alternating int/string tags don't share indexes in doctag_syn0"""
     mixed_tag_corpus = [doc2vec.TaggedDocument(words, [i, words[0]]) for i, words in enumerate(raw_sentences)]
     model = doc2vec.Doc2Vec()
     model.build_vocab(mixed_tag_corpus)
     expected_length = len(sentences) + len(model.docvecs.doctags)  # 9 sentences, 7 unique first tokens
     self.assertEquals(len(model.docvecs.doctag_syn0), expected_length)
Example #8
def read_data(tag_with_genres=True):
    # read in dataframe
    info_df = pd.read_csv('album_info.csv', index_col=False)

    with open(os.path.join('corpus', filename), 'r') as f:
        # list of [id, 'sentence blahblah']
        data = [line.strip().split('\t') for line in f]
    train_docs = []
    for row in data:
        try:
            id_num = int(row[0])
            split_words = row[1].split(' ')
            id_label = [id_num]

            # also add genre tags in training docs
            if tag_with_genres:
                id_label += get_label(id_num, info_df)

            # add the data
            train_docs.append((split_words, id_label))
        except IndexError:
            print('read_data() IndexError: {}'.format(row))
    tagged_train_docs = [
        doc2vec.TaggedDocument(words=doc, tags=tag) for doc, tag in train_docs
    ]
    return tagged_train_docs
Example #9
 def fit(self, X, y=None):
     """Fit the model according to the given training data.
     Parameters
     ----------
     X : {iterable of :class:`~gensim.models.doc2vec.TaggedDocument`, iterable of list of str}
         A collection of tagged documents used for training the model.
     Returns
     -------
     :class:`~gensim.sklearn_api.d2vmodel.D2VTransformer`
         The trained model.
     """
     if isinstance(X[0], doc2vec.TaggedDocument):
         d2v_sentences = X
     else:
         d2v_sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(X)]
     self.gensim_model = models.Doc2Vec(
         documents=d2v_sentences, dm_mean=self.dm_mean, dm=self.dm,
         dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count,
         docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment,
         trim_rule=self.trim_rule, vector_size=self.size, alpha=self.alpha, window=self.window,
         min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample,
         seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, hs=self.hs,
         negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn,
         epochs=self.iter, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words
     )
     return self
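Usage sketch for the fit method above: this is the gensim 3.x sklearn wrapper (gensim.sklearn_api was removed in gensim 4.0), shown here with the bundled common_texts toy corpus.

from gensim.test.utils import common_texts
from gensim.sklearn_api import D2VTransformer

transformer = D2VTransformer(min_count=1, size=5)
doc_vectors = transformer.fit_transform(common_texts)  # one 5-dimensional vector per document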
def label_sentences(corpus, label_type):
    labeled = []
    for i, v in enumerate(corpus):
        doc = " ".join(v)
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.TaggedDocument(doc.split(), [label]))
    return labeled
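A quick usage sketch for label_sentences; the token lists and label prefixes are made up.

train_texts = [["great", "acting", "and", "plot"], ["poor", "pacing"]]
train_tagged = label_sentences(train_texts, "TRAIN")   # tags: ['TRAIN_0'], ['TRAIN_1']
test_tagged = label_sentences([["solid", "soundtrack"]], "TEST")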
    def get_filing_from_db(self):
        # read files from DB, clean them and create TaggedDocument
        conn = psycopg2.connect(settings.CONN_STRING)
        words = []
        i = 0
        for i in range(0, self.max_docs):
            pickle_file_name = settings.BASE_PATH_FILINGS + '{}.pickle'.format(
                self.file_list[i])
            if os.path.isfile(pickle_file_name):
                print(
                    'Found pickle file {}, skipping cleaning/tokenization...'.
                    format(pickle_file_name))
                with open(pickle_file_name, 'rb') as f:
                    words = pickle.load(f)
            else:
                cur = conn.cursor()
                cur.execute(
                    'select f.filing, f.date_filed from filings f where f.filing_id = %s',
                    (self.filing_id_list[i], ))
                filing_text = ''
                for record in cur:
                    filing_text = record[0]
                cur.close()

                print('Cleaning {}...'.format(self.file_list[i]))
                words = get_words_from_doc(filing_text, self.tokenizer)

                with open(pickle_file_name, 'wb') as f:
                    pickle.dump(words, f)

            yield (doc2vec.TaggedDocument(words=words,
                                          tags=[self.file_list[i]]))
        conn.close()
Example #12
def trainingModel(alljudgements, listOfTopics, sentencesForTraining):
    model = Doc2Vec(dm=1,
                    min_count=1,
                    window=10,
                    vector_size=150,
                    sample=1e-4,
                    negative=10)

    #use all the extracted phrases of all files
    phrases = []
    for line in alljudgements:
        phrases.append(line)
    for line in listOfTopics:
        phrases.append(line)
    for line in sentencesForTraining:
        phrases.append(line)

    # tags must be a list of tags; here every document shares the single tag 'tag'
    sentences = [
        doc2vec.TaggedDocument(sentence, ['tag']) for sentence in phrases
    ]
    model.build_vocab(sentences)

    for epoch in range(500):
        # each call to train() runs model.epochs passes over the corpus
        model.train(sentences,
                    epochs=model.epochs,
                    total_examples=model.corpus_count)
        seconds = time.time()
        print("Seconds since epoch =", seconds)
        print("Epoch # {} is complete.".format(epoch + 1))
        if (epoch % 30 == 0):
            #save model
            model.save('doc2vec2.model')
Example #13
def my_doc2vec_model(doclist):
    reslist = []
    for i, doc in enumerate(doclist):
        blob = TextBlob(doc)
        np = list(blob.noun_phrases)
        reslist.append(doc2vec.TaggedDocument(np, [i]))
    return reslist
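Usage sketch for my_doc2vec_model; it assumes textblob is installed and its corpora have been downloaded (python -m textblob.download_corpora), since noun_phrases needs them.

docs = ["The quick brown fox jumps over the lazy dog.",
        "Doc2vec can be trained on noun phrases instead of raw tokens."]
tagged = my_doc2vec_model(docs)   # each document tagged with its list index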
def test():
    MyUtils.init_logging("VectorizeDescriptions.log")
    docs_percent_touse = 1  # on the full training set, 0.3 is probably advisable.
    chunk_size = 10 ** 5

    doc_filenames = [F.DESCDOCS] #, F.QADOCS_FILEPATH
    trainingset_ls = []
    for doc_filename in doc_filenames:
        for descdocs_chunk in pd.read_csv(doc_filename, chunksize=chunk_size):
            len_c = len(descdocs_chunk)
            indices = list(sorted(numpy.random.choice(len_c, int(docs_percent_touse * len_c), replace=False)))
            selected_rows = descdocs_chunk.iloc[indices]
            docs = []
            for tupl in selected_rows.itertuples():
                docs.append(D2V.TaggedDocument(words=ast.literal_eval(tupl.words), tags=ast.literal_eval(tupl.tags)))
            trainingset_ls.extend(docs)
            logging.info("Reading in the documents' words. Chunk processed...")
        logging.info("Completed: reading in a set of documents.")

    d2v_model = load_model()

    subset = trainingset_ls[0:5]
    logging.debug("%s", str(subset))
    for doc in subset:
        tag = doc.tags
        logging.debug("*** : %s", str(tag))
        logging.debug("XXX : %s", str(tag[0]))
        logging.debug("%s", str(d2v_model.docvecs[tag[0]]))
Example #15
def train_doc2vec_classifier(X1, X2, y, embedd):

    docs_token = [doc.split() for doc in X1]
    # tagged documents (not used below; vectors come from the pre-trained `embedd` model instead)
    documents = [doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(docs_token)]
    
    # get vector representation of docs
    X_vector = []
    for doc in docs_token:
        vector = embedd.infer_vector(doc)
        X_vector.append(vector)

    X = pd.concat([X2.reset_index(drop=True),pd.DataFrame(X_vector)], axis=1)
    
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=.75, 
                                                        random_state=1)
    
    #oversampling training set
    sm = SMOTE(sampling_strategy = 0.4)
    X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)
    
    lr = LogisticRegression(solver="lbfgs", class_weight='balanced', max_iter=1000)
    lr.fit(X_train_sm, y_train_sm)
    y_hat = lr.predict(X_test)
    
    return lr, y_hat, X_test, y_test
def read_documents_for_doc2vec(tokens_only=False):
    for i, path_content in enumerate(read_files_walk()):
        tokens = utils.preprocess(path_content[1])

        if tokens_only:
            yield path_content[0], tokens
        else:
            yield doc2vec.TaggedDocument(tokens, [i])
Example #17
 def create_tagged_documents(self, document_frame):
     permalinks = document_frame.permalink
     authors = document_frame.author
     tokens = document_frame.tokens
     return [
         d2v.TaggedDocument(words=tks, tags=[author + '/' + permalink])
         for tks, author, permalink in zip(tokens, authors, permalinks)
     ]
Example #18
def tagNotes(noteDF, noteCol, freqDist, low=4, highPer=0.74):
    highCount = highPer * noteDF.shape[0]
    taggedNotes = []
    for i, note in enumerate(nu.tokenize_and_stop(noteDF, noteCol)):
        clean_note = list(
            filter(lambda x: _check_word(x, freqDist, low, highCount), note))
        taggedNotes.append(dv.TaggedDocument(clean_note, [i]))
    return taggedNotes
Example #19
 def testBuildVocabWarning(self, l):
     """Test if logger warning is raised on non-ideal input to a doc2vec model"""
     raw_sentences = ['human', 'machine']
     sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(raw_sentences)]
     model = doc2vec.Doc2Vec()
     model.build_vocab(sentences)
     warning = "Each 'words' should be a list of words (usually unicode strings)."
     self.assertTrue(warning in str(l))
Example #20
def get_d2v_corpus(corpora):
    for i, line in enumerate(corpora):
        # For training data, add tags
        yield doc2vec.TaggedDocument([
            tkn
            for tkn in line.encode('utf-8').decode('utf-8').lower().split()
            if tkn not in stopwords_en
        ], [i])
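Usage sketch for get_d2v_corpus; stopwords_en is a module-level set in the original code, so a small stand-in is defined here.

stopwords_en = {"the", "a", "is", "on", "in"}   # stand-in for the original stop-word set

corpus = ["The cat sat on the mat", "A dog is in the yard"]
tagged = list(get_d2v_corpus(corpus))           # stop words removed, one tag per line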
    def tagged_sentences(self, sentences):
        tagged_sents = []
        for i, sent in enumerate(sentences):
            tagged_sents.append(d2v.TaggedDocument(sent,
                                                   ["sent_{}".format(i)]))

        print('tagged_sents:', len(tagged_sents))
        return tagged_sents
Example #22
 def read_as_tagged_document(self, filename):
     """
     Read in a single document as a TaggedDocument object
     :param filename: The name of the file
     :return: A TaggedDocument object
     """
     f = path.join(self.__path, filename)
     content = self.read_preprocessed(filename)
     return doc2vec.TaggedDocument(content, [filename])
def build_corpus(file_pre):
    with open(file_pre + "_train.txt", 'r') as f:
        for line in f.readlines():
            line = line.strip("\n")
            line = line.split("\t")
            tag = int(line[-1])
            line_list = line[0].split(' ')
            # print(line_list,tag)
            yield d2v.TaggedDocument(line_list, [tag])
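Usage sketch for build_corpus; "sample" is a hypothetical prefix, and the function expects sample_train.txt with one "<space-separated tokens><TAB><integer label>" line per document.

with open("sample_train.txt", "w") as f:
    f.write("good fun movie\t1\n")
    f.write("slow boring plot\t0\n")

train_corpus = list(build_corpus("sample"))   # documents of the same class share a tag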
Example #24
	def __iter__(self):
		f = open(fileName)
		for l in f:
			tabIndex = l.find('\t')
			asin = l[:tabIndex]
			text = l[tabIndex+1:]
			cleaned = re.sub(r'[^\w]', ' ', text).lower()
			# split on runs of whitespace; r"\s*" would also split between every character
			yield d2v.TaggedDocument(words=re.split(r"\s+", cleaned), tags=[asin])
		f.close()
Example #25
    def __next__(self):
        try:

            data = next(self.corpus)
            t = self.__preprocess(data[1])
            return doc2vec.TaggedDocument(t, [data[0]])
        except StopIteration:
            self.rebot()
            raise StopIteration
Example #26
 def read_corpus(self):
     """
     Get all documents in the current folder in TaggedDocuments type
     :return: generator of documents to product TaggedDocuments
     """
     files = listdir(self.__path)
     for i, document in enumerate(files):
         content = self.read_preprocessed(document)
         yield doc2vec.TaggedDocument(content, [document])
Example #27
def getCleanLabeledReviews(reviews):
    clean_reviews = []
    for review in reviews["review"]:
        clean_reviews.append(clear_review_to_words(review))

    labelized = []
    for i, id_label in enumerate(reviews["id"]):
        labelized.append(doc2vec.TaggedDocument(clean_reviews[i], [id_label]))
    return labelized
def tagged_iterator(text_iterator):
    """Processes texts in the doc_iterator and returns
    an iterator of tagged documents"""
    count = 0
    for bow in text_iterator:
        if len(bow) > 0:
            yield doc2vec.TaggedDocument(bow, [count])
            count += 1
    print(count - 1)
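A short usage sketch for tagged_iterator; the token lists are illustrative.

texts = [["alpha", "beta"], [], ["gamma", "delta"]]
docs = list(tagged_iterator(texts))   # the empty document is skipped; prints the last tag used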
Example #29
def train_model_by_tokens(documents_in_tokens: Iterable[Iterable[str]],
                          vector_size: int, min_count: int):
    documents = [
        doc2vec.TaggedDocument(tokens, [i])
        for i, tokens in enumerate(documents_in_tokens)
    ]
    return doc2vec.Doc2Vec(documents,
                           vector_size=vector_size,
                           min_count=min_count)
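Usage sketch for train_model_by_tokens; the token lists and hyperparameters are placeholders.

docs = [["hello", "world", "hello"], ["gensim", "doc2vec", "example"]]
model = train_model_by_tokens(docs, vector_size=32, min_count=1)
vector = model.infer_vector(["hello", "gensim"])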
Example #30
    def train_doc2vec(self):
        x_train = []
        for line in self.dataset:
            patent = json.loads(line)
            x_train.append(
                doc2vec.TaggedDocument(patent['words'],
                                       [patent['patent_number']]))

        self._train(x_train)