Code Example #1
File: modelTraining.py Project: ymuaa/FYP
News_docs = [doc for doc in alldocs]
#print(News_docs[0])

print('%d News_docs' % (len(alldocs)))

#shuffle?
from random import shuffle
doc_list = alldocs[:]
shuffle(doc_list)

# 1. create the Doc2Vec model
# you may need to adjust the hyper-parameters, e.g. vector_size and epochs
cores = multiprocessing.cpu_count()
simple_models = [
    # PV-DBOW plain
    Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, sample=0,
            epochs=20, workers=cores),
    # PV-DM w/ default averaging; a higher starting alpha may improve CBOW/PV-DM modes
    Doc2Vec(dm=1, vector_size=100, window=10, negative=5, hs=0, min_count=2, sample=0,
            epochs=20, workers=cores, alpha=0.05, comment='alpha=0.05'),
    # PV-DM w/ concatenation - big, slow, experimental mode
    # window=5 (both sides) approximates paper's apparent 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, vector_size=100, window=5, negative=5, hs=0, min_count=2, sample=0,
            epochs=20, workers=cores),
]

# 2. build vocabulary
for model in simple_models:
    model.build_vocab(alldocs)
    print("%s vocabulary scanned & state initialized" % model)

models_by_name = OrderedDict((str(model), model) for model in simple_models)
Code Example #2
def generate_word_embeddings(papers):
    global document_similarity
    lines = []
    with open(DATASET_INPUT_FILE_PATH + papers, 'rb') as f:
        for line in tqdm(f, desc='Read papers'):
            lines.append(json.loads(line))

    lines.sort(key=lambda x: x['year'])

    ids = extract_keys(lines, 'id')
    titles = extract_keys(lines, 'title')
    abstracts = extract_keys(lines, 'paperAbstract')
    out_citations = extract_keys(lines, 'outCitations')

    # TODO: DO NOT HARDCODE THIS
    is_test = False

    train_ids, eval_ids = split_data(ids, 0.8, 0.9, is_test)
    train_abstracts, eval_abstracts = split_data(abstracts, 0.8, 0.9, is_test)
    train_titles, eval_titles = split_data(titles, 0.8, 0.9, is_test)
    train_out_citations, eval_out_citations = split_data(
        out_citations, 0.8, 0.9, is_test)

    # dictionary = unk_train(train_abstracts)
    train_docs = create_tagged_doc(train_abstracts)

    model = Doc2Vec(workers=11,
                    min_count=5,
                    window=10,
                    size=100,
                    alpha=0.025,
                    iter=20)
    model.build_vocab(train_docs)
    model.train(train_docs,
                epochs=model.iter,
                total_examples=model.corpus_count)

    eval_score = []
    matching_citation_count = 1
    min_rank = float("inf")

    # TODO: changed eval_abstracts -> eval_titles
    for i, eval_abstract in tqdm(
            list(enumerate(eval_titles[:10])),
            desc='Generating rankings for evaluation set'):
        rankings = []
        eval_split = eval_abstract.lower().split()

        if len(eval_split):
            # TODO: changed train_abstracts -> train_titles
            for j, train_abstract in tqdm(
                    list(enumerate(train_titles)),
                    desc='Iterating through train titles'):
                train_split = train_abstract.lower().split()
                if len(train_split):
                    document_similarity = model.wmdistance(
                        train_split, eval_split)
                    rankings.append((document_similarity, j))
            rankings.sort(key=lambda x: x[0])

            out_citations = eval_out_citations[i]
            if len(out_citations):
                # gets the rankings of the training papers in the correct order
                ranking_ids = get_from_rankings(rankings, train_ids)
                true_citations = [
                    citation for citation in ranking_ids
                    if citation in out_citations
                ]

                if len(true_citations):
                    matching_citation_count += 1
                    rank = ranking_ids.index(true_citations[0]) + 1
                    min_rank = min(min_rank, rank)
                    eval_score.append(1.0 / rank)
                    print("\nEval Score for iteration " + str(i) + ": " +
                          str(1.0 / rank) + "\n")

    print("matching citation count = " + str(matching_citation_count))
    print(eval_score)
    print("min rank = " + str(min_rank))
    print(sum(eval_score) / matching_citation_count)
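
Note: the helper functions extract_keys, split_data, create_tagged_doc and get_from_rankings are not part of this excerpt. A minimal sketch of what they might look like, inferred only from how they are called above (the real project code may differ):

from gensim.models.doc2vec import TaggedDocument

def extract_keys(lines, key):
    # Pull one field out of every JSON record.
    return [line[key] for line in lines]

def split_data(items, train_frac, eval_frac, is_test):
    # Split the chronologically ordered items into a train slice and a
    # held-out slice (eval or test, depending on is_test).
    n = len(items)
    train = items[:int(n * train_frac)]
    if is_test:
        held_out = items[int(n * eval_frac):]
    else:
        held_out = items[int(n * train_frac):int(n * eval_frac)]
    return train, held_out

def create_tagged_doc(abstracts):
    # One TaggedDocument per abstract, tagged with its position.
    return [TaggedDocument(abstract.lower().split(), [i])
            for i, abstract in enumerate(abstracts)]

def get_from_rankings(rankings, train_ids):
    # Map (distance, train_index) pairs back to paper ids, best first.
    return [train_ids[j] for _, j in rankings]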
Code Example #3
    'TRAIN_NEG': 'train-neg.txt',
    'TRAIN_POS': 'train-pos.txt',
    'TRAIN_UNS': 'train-unsup.txt'
}
documents = TaggedLineDocs(sources)
log.info('loaded %i documents', len(documents.sentences))

epochs = 30
vec_size = 100

log.info('Initializing D2V model')
model = Doc2Vec(min_count=3,
                window=10,
                size=vec_size,
                sample=1e-4,
                negative=5,
                workers=4,
                dm=0,
                seed=seed,
                iter=epochs)
model.build_vocab(documents)

log.info('Training D2V Epochs %i', epochs)
model.train(documents, total_examples=model.corpus_count, epochs=model.iter)

log.info('Model Save')
model.save('./imdb.d2v')

# log.info('Load Pre-trained D2V Model')
# model = Doc2Vec.load('./imdb.d2v')
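
The TaggedLineDocs class is defined elsewhere in this project; a minimal sketch consistent with its usage above (a {tag_prefix: filename} mapping and a .sentences list of TaggedDocument objects) could be:

from gensim.models.doc2vec import TaggedDocument

# Hypothetical stand-in for the project's TaggedLineDocs; the constructor
# argument and the .sentences attribute are inferred from the code above.
class TaggedLineDocs(object):
    def __init__(self, sources):
        self.sentences = []
        for prefix, filename in sources.items():
            with open(filename, encoding='utf-8') as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(
                        TaggedDocument(line.split(), ['%s_%d' % (prefix, item_no)]))

    def __iter__(self):
        return iter(self.sentences)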
Code Example #4

train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['text']), tags=[r.score]),
    axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['text']), tags=[r.score]),
    axis=1)

print("1")
"""Building vocabulary"""

model_dbow = Doc2Vec(dm=0,
                     vector_size=300,
                     negative=5,
                     hs=0,
                     min_count=2,
                     sample=0,
                     workers=4)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

print("2")
"""Initialise model"""

for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]),
                     total_examples=len(train_tagged.values),
                     epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha
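
The tokenize_text helper used above is not shown in this excerpt; a plausible NLTK-based version (an assumption, not the project's actual helper) is:

from nltk.tokenize import sent_tokenize, word_tokenize

# Hypothetical tokenize_text; requires the NLTK 'punkt' tokenizer data
# (nltk.download('punkt')). The real helper used above may differ.
def tokenize_text(text):
    tokens = []
    for sent in sent_tokenize(text):
        for word in word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens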
Code Example #5

#

tagged_documents_iterator = LabeledLineSentence(
    documents_tokenized, range(len(documents_tokenized)))

#
list(tagged_documents_iterator)

#

model = Doc2Vec(size=500,
                window=10,
                workers=11,
                alpha=0.025,
                min_alpha=0.025,
                iter=10,
                min_count=1)

#

model.build_vocab(tagged_documents_iterator)

# In[12]:

model.wv

# In[13]:

model.train(tagged_documents_iterator,
            total_examples=model.corpus_count,
            epochs=model.iter)
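
LabeledLineSentence here is constructed from pre-tokenized documents and a range of integer labels; a minimal sketch matching that call (an assumption about the missing class) could be:

from gensim.models.doc2vec import TaggedDocument

# Hypothetical LabeledLineSentence matching the constructor call above:
# pairs each pre-tokenized document with its label as a TaggedDocument.
class LabeledLineSentence(object):
    def __init__(self, docs, labels):
        self.docs = docs
        self.labels = list(labels)

    def __iter__(self):
        for words, label in zip(self.docs, self.labels):
            yield TaggedDocument(words=words, tags=[label])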
Code Example #6
def run_doc2vec(train_docs,
                dev_docs,
                test_docs,
                dm,
                size,
                window,
                alpha,
                negative,
                sample,
                cores,
                min_count,
                passes,
                output,
                diagnostics=False):

    assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

    model = Doc2Vec(dm=dm,
                    size=size,
                    window=window,
                    alpha=alpha,
                    min_alpha=alpha,
                    negative=negative,
                    sample=sample,
                    workers=cores,
                    min_count=min_count,
                    iter=1)
    model.build_vocab(train_docs)

    train_shuffled = train_docs[:]  # copy, so shuffling does not reorder train_docs

    whole_duration = 0

    # infer_vecs / test_vectors are used after the training loop regardless of
    # the diagnostics flag, so define them unconditionally
    infer_vecs = np.zeros((len(test_docs), size))
    test_vectors = dict()

    if (diagnostics):
        dev_vecs = np.zeros((len(dev_docs), size))
        dev_vectors = dict()
        neighb_num = 10
        words = []
        for doc in train_docs:
            words += doc.words
        counter = collections.Counter(words)
        if not os.path.exists('diagnostics/'):
            os.mkdir('diagnostics/')
        i = output.find('/')
        diag_folder = 'diagnostics' + output[i:].replace(' ', '_').replace(
            '-', '').replace('.txt', '') + '/'
        if not os.path.exists(diag_folder):
            os.mkdir(diag_folder)
        tmp_dir = 'temp' + output[:i].replace('.txt', '') + '/'
        if not os.path.exists(tmp_dir):
            os.mkdir(tmp_dir)

        par_cols = [
            'dm', 'size', 'window', 'alpha', 'negative', 'sample', 'min_count',
            'epochs', 'cores'
        ]
        par_use = [
            dm, size, window, alpha, negative, sample, min_count, passes, cores
        ]
        par_df = pd.DataFrame(columns=par_cols)
        for par_c, par_u in zip(par_cols, par_use):
            par_df.loc[0, par_c] = par_u
        par_df.to_csv(diag_folder + 'pars.csv')

        df = pd.DataFrame(columns=['neighbours'])

        p_ids = np.linspace(0, len(train_docs) - 1, num=5)

        dev = np.zeros(passes + 1)
        train = np.zeros(passes + 1)
        train_N = 30
        train_ids_for_cost = np.linspace(0, len(train_docs) - 1, num=train_N)
        train_for_cost = []
        for i in train_ids_for_cost:
            train_for_cost += [train_docs[int(i)]]

    print("START %s" % datetime.datetime.now())

    with elapsed_timer() as elapsed:

        for epoch in range(passes):

            if (diagnostics):
                if (epoch == 0):
                    diagnose(diag_folder, model, counter, p_ids, neighb_num,
                             df, dev, train, epoch, alpha, passes,
                             train_for_cost, train_N, dev_docs, dev_vectors,
                             dev_vecs, output)
            shuffle(train_shuffled)
            if (epoch + 1 < 6):
                model.alpha = 0.2
            else:
                model.alpha = alpha
            model.min_alpha = model.alpha
            model.train(train_shuffled,
                        total_examples=len(train_docs),
                        epochs=1)

            print('epoch %d' % (epoch + 1))
            #N = 1000
            if (diagnostics):
                diagnose(diag_folder, model, counter, p_ids, neighb_num, df,
                         dev, train, epoch + 1, alpha, passes, train_for_cost,
                         train_N, dev_docs, dev_vectors, dev_vecs, output)

        for i, doc in enumerate(test_docs):
            infer_vecs[i, :] = model.infer_vector(doc.words,
                                                  alpha=alpha,
                                                  min_alpha=alpha,
                                                  steps=passes)
            test_vectors[i] = tuple([infer_vecs[i, :], doc.tags])

        test = cost(model, test_vectors, test_docs, len(test_docs))
        #print (test)

    whole_duration += elapsed()
    model.save(output)
    f = open(output + 'test', 'wb')
    pickle.dump(test_vectors, f)
    if (diagnostics):
        dev_pickle = open(diag_folder + 'dev.npy', 'wb')
        pickle.dump(dev, dev_pickle)
        train_pickle = open(diag_folder + 'train.npy', 'wb')
        pickle.dump(train, train_pickle)
        print('dev_cost (%d documents)' % len(dev_docs), dev)
        print('train_cost', train)
    print('infer_cost', test)

    print("END %s" % str(datetime.datetime.now()))
    print("duration %s" % str(whole_duration))
Code Example #7
file_logistic_reg = "clsf_logistic_reg_" + label_name + ".pickle"
p.dump(clsf_logistic_reg, open(file_logistic_reg, 'wb'))

# Report - Logistic Regression
pred_logistic_reg = clsf_logistic_reg.predict(x_test)
acc_logistic_reg = accuracy_score(pred_logistic_reg, y_test) * 100
"""## Doc2vec and Logistic Regression"""

## Doc2vec and Logistic Regression
x_train_labeled = label_sentences(x_train, 'train')
x_test_labeled = label_sentences(x_test, 'test')
all_data = x_train_labeled + x_test_labeled

clsf_dbow = Doc2Vec(dm=0,
                    vector_size=300,
                    negative=5,
                    min_count=1,
                    alpha=0.065,
                    min_alpha=0.065)
clsf_dbow.build_vocab([x for x in tqdm(all_data)])
for epoch in range(30):
    clsf_dbow.train(utils.shuffle([x for x in tqdm(all_data)]),
                    total_examples=len(all_data),
                    epochs=1)
    clsf_dbow.alpha -= 0.002
    clsf_dbow.min_alpha = clsf_dbow.alpha

train_vectors_dbow = get_vectors(clsf_dbow, len(x_train_labeled), 300, 'train')
test_vectors_dbow = get_vectors(clsf_dbow, len(x_test_labeled), 300, 'test')

clsf_d2v = LogisticRegression(n_jobs=1, C=1e5)
clsf_d2v = clsf_d2v.fit(train_vectors_dbow, y_train)
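
label_sentences and get_vectors are defined elsewhere in this project; a sketch of what they presumably do, based only on the calls above:

from gensim.models.doc2vec import TaggedDocument
import numpy as np

# Hypothetical versions of the helpers used above; inferred from their calls.
def label_sentences(corpus, label_type):
    # Tag each text with a unique '<label_type>_<i>' label.
    labeled = []
    for i, text in enumerate(corpus):
        labeled.append(TaggedDocument(text.split(), ['%s_%d' % (label_type, i)]))
    return labeled

def get_vectors(model, corpus_size, vector_size, label_type):
    # Collect the learned document vectors back into a numpy matrix.
    vectors = np.zeros((corpus_size, vector_size))
    for i in range(corpus_size):
        vectors[i] = model.docvecs['%s_%d' % (label_type, i)]
    return vectors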
Code Example #8
from gensim import models
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
from collections import namedtuple
import gensim.models.doc2vec
doc1=["latest categories","my events",
           "get all my tags","all my feeds","new groups"]
# Load data

# doc1 = ["This is a sentence", "This is another sentence"]

# Transform data (you can add more data preprocessing steps)

docs = []
analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
for i, text in enumerate(doc1):
    words = text.lower().split()
    tags = [i]
    docs.append(analyzedDocument(words, tags))

# Train model (set min_count = 1, if you want the model to work with the provided example data set)

model = Doc2Vec(docs, size = 100, window = 300, min_count = 1, workers = 4)

# Get the vectors
print(model.docvecs[0])
print(model.docvecs[1])
Code Example #9
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
from datetime import datetime
import sys

class LabeledLineSentence(object):
    def __init__(self, filename):
        self.filename = filename
    def __iter__(self):
        for uid, line in enumerate(open(self.filename)):
            words=line.strip().split()
            yield LabeledSentence(words[1:],[words[0]])

            
if __name__=='__main__':
    print(sys.argv[0])
    input_file=sys.argv[1]
    output_file=sys.argv[2]
    
    sentences=LabeledLineSentence(input_file)
    model = Doc2Vec(alpha=0.025, min_alpha=0.025)  # use fixed learning rate
    model.build_vocab(sentences)
    for epoch in range(10):
        print(datetime.now(),'epoch:',epoch)
        model.train(sentences)
        model.alpha -= 0.002  # decrease the learning rate
        model.min_alpha = model.alpha  # fix the learning rate, no decay
    model.save(output_file)
    print('Done. Document vectors written to', output_file)
Code Example #10
    with open("sentiment labelled sentences/%s_labelled.txt" % fname) as f:
        for item_no, line in enumerate(f):
            line_split = line.strip().split('\t')
            sent = line_split[0].lower()
            sent = re.sub(r'\'', '', sent)
            sent = re.sub(r'\W', ' ', sent)
            sent = re.sub(r'\s+', ' ', sent).strip()
            #sentences.append(LabeledSentence(sent.split(), ["%s_%d" % (fname, item_no)]))
            sentences.append(
                TaggedDocument(sent.split(), ["%s_%d" % (fname, item_no)]))
            sentiments.append(int(line_split[1]))

sentences

import random


class PermuteSentences(object):
    def __iter__(self):
        shuffled = list(sentences)
        random.shuffle(shuffled)
        for sent in shuffled:
            yield sent


permuter = PermuteSentences()

model = Doc2Vec(permuter, min_count=1)

#model.most_similar('tasty')
model.wv.most_similar('tasty')
Code Example #11
File: doc2vec.py Project: vgupta123/UBR
    for text in entity:
        words = text.split()
        tags = [i]
        alldocs.append(SentimentDocument(words, tags))
        i = i + 1

doc_list = alldocs[:]  # for reshuffling per pass

cores = multiprocessing.cpu_count()

simple_models = [
    # PV-DBOW
    Doc2Vec(dm=0,
            size=400,
            negative=5,
            hs=0,
            min_count=20,
            workers=cores,
            dbow_words=1),
]

simple_models[0].build_vocab(
    alldocs
)  # PV-DM/concat requires one special NULL word so it serves as template
print(simple_models[0])

models_by_name = OrderedDict((str(model), model) for model in simple_models)

best_error = defaultdict(
    lambda: 1.0)  # to selectively-print only best errors achieved
Code Example #12
def main():
    '''
    create a parser to read the command line arguments
    the parsed arguments are returned as an object
    '''
    parser = argparse.ArgumentParser(description="machine learning model")
    parser.add_argument('-c',
                        '--csv_filename',
                        default=None,
                        help='filename for the csv dataset')
    parser.add_argument('-d',
                        '--debug',
                        action='store_true',
                        help='turn on debug statements')

    args = parser.parse_args()

    if args.csv_filename:
        csv_filename = args.csv_filename
    else:
        parser.print_help()
        exit()
    if args.debug:
        debug = True
    else:
        debug = False
    '''
    open the csv file for processing
    '''
    if debug:
        print("Starting to read CSV")
    df = process_csv(csv_filename)

    if debug:
        print("Read CSV ---> DONE!")
    '''
    obtain the test number from the csv filename
    this is used later for saving model and classifier
    '''
    try:
        test_num = args.csv_filename.rsplit("/")[-1].rsplit("_")[0]
    except:
        test_num = 0
    '''
    perform labelling of given n-grams
    doc2vec requires each document to be labelled
    '''

    if debug:
        print("Starting to process ngrams")
    ngrams = df['ngram']
    labelled_ngrams = []
    for i in range(len(ngrams)):
        labelled_ngrams.append(TaggedDocument(ngrams[i].split(), [i]))

    if debug:
        print("Process ngrams ---> DONE!")
    '''
    create a model and export it to the given path
    '''
    if debug:
        print("Creating model")
    model = Doc2Vec(dm=1,
                    min_count=1,
                    window=10,
                    vector_size=150,
                    sample=1e-4,
                    negative=10)
    if debug:
        print("Model --> CREATED!")
    if debug:
        print("Training model")
    trained_model = train_d2v_model(model, labelled_ngrams, n_epochs=20)
    if debug:
        print("Model --> TRAINED!")

    d2v_path = os.path.join(os.getcwd(), "doc2vec_model{}".format(test_num))
    save_d2v_model(trained_model, d2v_path)

    model_loaded = load_d2v_model(d2v_path)
    '''
    generate the target array
    this is needed as target can only be integers however in our
    case the dataset contains the 'Benign' and 'Malware' tags
    transformation is achieved using the module LabelEncoder from sklearn
    '''
    le = LabelEncoder()
    le.fit(["Benign", "Malware"])
    target = le.transform(df['label'])
    '''
    get the inference vectors from d2v model
    '''
    data = []
    for i in range(len(df['ngram'])):
        data.append(model_loaded.docvecs[i])
    '''
    create a split for test and training data
    currently it is 70% train and 30% test
    '''
    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        target,
                                                        test_size=0.3,
                                                        random_state=0)
    '''
    initialize an SVM classifier and perform fitting for the given training data
    '''

    client = Client(processes=False)

    clf = svm.SVC(kernel='linear')

    with joblib.parallel_backend('dask'):
        clf.fit(x_train, y_train)
    '''
    create a prediction array to later compare with the test array
    '''
    y_pred = clf.predict(x_test)
    '''
    describe the classifier path and then save the classifier
    '''

    clf_path = "/home/architp/projects/def-daknox/architp/mitacs/ml/model{}.pkl".format(
        test_num)
    save_clf(clf, clf_path)
    '''
    finally print the stats
    '''
    print_stats(y_pred, y_test)
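
The doc2vec helpers called in main() (train_d2v_model, save_d2v_model, load_d2v_model, save_clf) are defined elsewhere; a minimal sketch consistent with how they are used above:

from gensim.models.doc2vec import Doc2Vec
import pickle

# Hypothetical implementations; the project's real helpers may differ.
def train_d2v_model(model, labelled_ngrams, n_epochs=20):
    # Build the vocabulary and run the requested number of training epochs.
    model.build_vocab(labelled_ngrams)
    model.train(labelled_ngrams,
                total_examples=model.corpus_count,
                epochs=n_epochs)
    return model

def save_d2v_model(model, path):
    model.save(path)

def load_d2v_model(path):
    return Doc2Vec.load(path)

def save_clf(clf, path):
    with open(path, 'wb') as f:
        pickle.dump(clf, f)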
Code Example #13
import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

# In[24]:

tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(x)]

# In[25]:

len(tagged_data)

# In[26]:

model = Doc2Vec(tagged_data, vector_size=20, epochs=100, min_count=0)

#Save this doc2vec model
pickle.dump(model, open('docvec.pkl', 'wb'))

# In[27]:

model.corpus_count

# In[28]:

o_model = model.docvecs.vectors_docs

# In[29]:

o_model.shape
Code Example #14
File: docvec.py Project: broshanfekr/Ms.Thesis
train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']
doc_list = train_docs[:]  # for reshuffling per pass

print('%d docs: %d train-sentiment, %d test-sentiment' %
      (len(doc_list), len(train_docs), len(test_docs)))
##############################################################################################
cores = multiprocessing.cpu_count()
simple_models = [
    # PV-DM w/ concatenation
    Doc2Vec(dm=1,
            dm_concat=1,
            size=150,
            window=10,
            sample=1e-2,
            negative=25,
            hs=0,
            min_count=2,
            workers=cores),
    # PV-DBOW
    Doc2Vec(dm=0,
            size=150,
            negative=25,
            window=10,
            hs=0,
            min_count=2,
            workers=cores),
    # PV-DM w/average
    Doc2Vec(dm=1,
            dm_mean=1,
            # remaining hyper-parameters assumed to mirror the models above
            size=150,
            window=10,
            negative=25,
            hs=0,
            min_count=2,
            workers=cores),
]
Code Example #15

log.info('source load')
train_source = {'train-neg.txt':'TRAIN_NEG', 'train-pos.txt':'TRAIN_POS', 'test-neg.txt':'TEST_NEG', 'test-pos.txt':'TEST_POS', 'train-unsup.txt':'TRAIN_UNS'}
# train_source = {'train-neg.txt':'TRAIN_NEG', 'train-pos.txt':'TRAIN_POS', 'test-neg.txt':'TEST_NEG', 'test-pos.txt':'TEST_POS'}
test_source = {'test-neg.txt':'TEST_NEG', 'test-pos.txt':'TEST_POS'}

log.info('TaggedDocument')
train_sentences = TaggedLineSentence(train_source)
test_sentences = TaggedLineSentence(test_source)

marcos = train_sentences.to_array()


log.info('D2V')
model = Doc2Vec(min_count=1, window=10, vector_size=150, sample=1e-4, negative=5, workers=7,iter=40)
model.build_vocab(train_sentences.to_array())

log.info('Epoch')

# log.info('EPOCH: {}'.format(epoch))
model.train(train_sentences.sentences_perm(),total_examples=model.corpus_count,epochs=model.iter)

log.info('Model Save')
model.save('./imdbc.d2v')
model = Doc2Vec.load('./imdbc.d2v')

log.info('Sentiment')
train_arrays = numpy.zeros((25000, 150))
train_labels = numpy.zeros(25000)
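
A sketch of how these arrays are typically filled in this IMDB setup (assuming 12,500 positive and 12,500 negative training reviews, tagged 'TRAIN_POS_i' / 'TRAIN_NEG_i' by the TaggedLineSentence class):

for i in range(12500):
    # learned document vectors, looked up by tag
    train_arrays[i] = model.docvecs['TRAIN_POS_' + str(i)]
    train_arrays[12500 + i] = model.docvecs['TRAIN_NEG_' + str(i)]
    train_labels[i] = 1
    train_labels[12500 + i] = 0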
Code Example #16
File: doc2vec.py Project: UVA-DSI/2018-Capstone-PLOS
from gensim.models import Doc2Vec
import gensim.models.doc2vec

print('loading docs...')
start_time = time()
documents = [
    doc for doc in TaggedLineDocument('volume2/processed_body_docs.txt')
]
print("--- %s seconds ---" % (time() - start_time))

#documents = []
#with open('/volume/processed_body_docs.txt') as f:
#    for line in f:
#        documents.append(TaggedLineDocument(line))

print('training doc2vec model...')
start_time = time()
model = Doc2Vec(documents,
                vector_size=200,
                window=5,
                min_count=5,
                workers=14,
                epochs=20)
print("--- %s seconds ---" % (time() - start_time))

print('saving model...')
np.save('volume2/new_models/body_features-w2v-200.npy',
        model.docvecs.doctag_syn0)
model.save('volume2/new_models/body_features-w2v-200.doc2vec')

print('complete!')
Code Example #17
#Tagging document sentences 
comp_docs = [TaggedDocument(
                words=[word for word in document[0].lower().split()],
                tags = [i]
            ) for i, document in enumerate(train_docs)]


# In[48]:


max_epoch = 2
vec_size = 20
    
# Train model
model = Doc2Vec(size = vec_size, dm = 0, dbow_words = 1, window = 2, alpha = 0.2)
model.build_vocab(comp_docs)
for epoch in range(max_epoch):
    model.train(comp_docs, total_examples = model.corpus_count, epochs = 1)
    
model.save("Doc2Vec.model")
print("Model Saved")


# In[78]:


def build_model(test_doc, compiled_doc):
    '''
    Parameters
    -----------
Code Example #18
    def __generate(self):
        # Generate full set of LabeledSentences
        full_labeled_sentences = []
        for tag, metadata_map in self.data_source_map.items():
            full_labeled_sentences.extend(
                metadata_map[self.LABELED_SENTENCES_KEY])

        # Generate the model
        print('Instantiating Doc2Vec model...')
        self.model = Doc2Vec(documents=full_labeled_sentences,
                             min_count=10,
                             size=100,
                             workers=1,
                             sample=1e-4,
                             negative=5)

        # Save model with metadata such as current epoch time in filename for later processing
        # TODO: add in a utility to load historical models for comparison
        model_file_name = "review_model_%d.d2v" % int(round(
            time.time() * 1000))
        save_path = constants.GENERATED_MODEL_OUTPUT_DIR + model_file_name

        print('Finished model generation. Saving model to %s' % save_path)
        self.model.save(save_path)

        print('Finished model generation. Begin fitting classifier...')

        print('Constructing training vectors')
        positive_training_vectors = self.__get_vectors_list(
            constants.POSITIVE_TRAINING_TAG)
        negative_training_vectors = self.__get_vectors_list(
            constants.NEGATIVE_TRAINING_TAG)
        full_training_vectors = positive_training_vectors + negative_training_vectors

        print('Constructing training labels')
        positive_training_labels = numpy.ones(
            shape=len(positive_training_vectors))
        negative_training_labels = numpy.zeros(
            shape=len(negative_training_vectors))
        full_training_labels = numpy.concatenate(
            (positive_training_labels, negative_training_labels), axis=0)

        print('Fitting classifier to training data')
        self.classifier.fit(full_training_vectors, full_training_labels)

        print('Finished fitting classifier. Begin scoring classifier...')

        print('Constructing testing vectors')
        positive_testing_vectors = self.__get_vectors_list(
            constants.POSITIVE_TESTING_TAG)
        negative_testing_vectors = self.__get_vectors_list(
            constants.NEGATIVE_TESTING_TAG)
        full_testing_vectors = positive_testing_vectors + negative_testing_vectors

        print('Constructing testing labels')
        positive_testing_labels = numpy.ones(
            shape=len(positive_testing_vectors))
        negative_testing_labels = numpy.zeros(
            shape=len(negative_testing_vectors))
        full_testing_labels = numpy.concatenate(
            (positive_testing_labels, negative_testing_labels), axis=0)

        print('Scoring classifier')
        score = self.classifier.score(full_testing_vectors,
                                      full_testing_labels)

        print('Classifier received a score of %.4f' % score)
Code Example #19
File: embedclef.py Project: libin19861023/RARE
    def train(self):
        self.model = Doc2Vec()
        self.model.build_vocab(self.labelledSents)
        for i in tqdm(range(10)):
            self.model.train(self.labelledSents)
Code Example #20
                            [prefix + '_%s' % item_no]))
        return self.sentences

    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences


from pathlib import Path

if not Path("./imdb.d2v").is_file():
    # no saved model yet, so train and save one
    sentences = LabeledLineSentence(sources)
    model = Doc2Vec(min_count=1,
                    window=10,
                    size=100,
                    sample=1e-4,
                    negative=5,
                    workers=16)
    model.build_vocab(sentences.to_array())

    for epoch in range(20):
        log.info('Epoch %d' % epoch)
        model.train(
            sentences.sentences_perm(),
            total_examples=model.corpus_count,
            epochs=model.iter,
        )

    model.save('./imdb.d2v')

model = Doc2Vec.load('./imdb.d2v')
Code Example #21
File: doc2vecModel.py Project: zzoliman/pyTextMiner
    def run(self, documents, output_base_dir, vocab_min_count, num_epochs,
            algorithm, vector_size, alpha, min_alpha, train, window, cores):

        # As soon as FAST_VERSION is not -1, there are compute-intensive codepaths that avoid holding
        # the python global interpreter lock, and thus you should start to see multiple cores engaged.
        # For more details see: https://github.com/RaRe-Technologies/gensim/issues/532
        # assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

        if cores is None:
            cores = multiprocessing.cpu_count()

        negative = 5
        hs = 0

        docvecs_mapfile = 'docvecs_map.txt'

        if algorithm == 'pv_dmc':
            # PV-DM with concatenation
            # window=5 (both sides) approximates paper's 10-word total window size
            # PV-DM w/ concatenation adds a special null token to the vocabulary: '\x00'
            model = Doc2Vec(dm=1,
                            dm_concat=1,
                            vector_size=vector_size,
                            window=window,
                            negative=negative,
                            hs=hs,
                            min_count=vocab_min_count,
                            workers=cores,
                            docvecs_mapfile=docvecs_mapfile)
        elif algorithm == 'pv_dma':
            # PV-DM with average
            # window=5 (both sides) approximates paper's 10-word total window size
            model = Doc2Vec(dm=1,
                            dm_mean=1,
                            vector_size=vector_size,
                            window=window,
                            negative=negative,
                            hs=hs,
                            min_count=vocab_min_count,
                            workers=cores,
                            docvecs_mapfile=docvecs_mapfile)
        elif algorithm == 'pv_dbow':
            # PV-DBOW
            model = Doc2Vec(dm=0,
                            vector_size=vector_size,
                            window=window,
                            negative=negative,
                            hs=hs,
                            min_count=vocab_min_count,
                            workers=cores,
                            docvecs_mapfile=docvecs_mapfile)
        else:
            raise ValueError('Unknown algorithm: %s' % algorithm)

        logging.info('Algorithm: %s' % str(model))

        logging.info('Build vocabulary')
        model.build_vocab(documents)
        vocab_size = len(model.wv.vocab)
        logging.info('Vocabulary size: %d', vocab_size)

        target_dir = self.make_timestamped_dir(output_base_dir, algorithm,
                                               model.vector_size, num_epochs,
                                               window)
        vocab_path = os.path.join(target_dir, 'vocabulary')
        logging.info('Save vocabulary to: %s', vocab_path)
        with open(vocab_path, 'w') as f:
            term_counts = [[term, value.count]
                           for term, value in model.wv.vocab.items()]
            term_counts.sort(key=lambda x: -x[1])
            for x in term_counts:
                f.write('%s, %d\n' % (x[0], x[1]))

        if train:
            logging.info('Shuffle documents')
            shuffle(documents)

            logging.info('Train model')
            model.train(documents,
                        total_examples=len(documents),
                        epochs=num_epochs,
                        start_alpha=alpha,
                        end_alpha=min_alpha)

            logging.info('Save model to: %s', target_dir)
            model.delete_temporary_training_data(keep_doctags_vectors=True,
                                                 keep_inference=True)
            model.save(os.path.join(target_dir, 'doc2vec.model'))

            model_meta = {
                'argv': sys.argv,
                'target_dir': target_dir,
                'algorithm': algorithm,
                'window': window,
                'vector_size': vector_size,
                'alpha': alpha,
                'min_alpha': min_alpha,
                'num_epochs': num_epochs,
                'vocab_min_count': vocab_min_count,
                'vocab_size': vocab_size,
                'cores': cores,
                'negative': negative,
                'hs': hs
            }

            model_meta_path = os.path.join(target_dir, 'model.meta')
            logging.info('Save model metadata to: %s', model_meta_path)
            with open(model_meta_path, 'w') as outfile:
                json.dump(model_meta, outfile)
Code Example #22
    '15同居多年未办理结婚登记,是否可以向法院起诉要求离婚'
]

# Use doc2vec to measure similarity
cores = multiprocessing.cpu_count()
print(cores)
corpora_documents = []
for i, item_text in enumerate(raw_documents):
    #words_list = util_words_cut.get_class_words_list(item_text)
    words_list = list(jieba.cut(item_text))
    document = TaggedDocument(words=words_list, tags=[i])
    corpora_documents.append(document)

print(corpora_documents[:2])

model = Doc2Vec(size=89, min_count=1, iter=10)
model.build_vocab(corpora_documents)
model.train(corpora_documents,
            total_examples=model.corpus_count,
            epochs=model.iter)

print('#########', model.vector_size)

test_data_1 = '你好,我想问一下我想离婚他不想离,孩子他说不要,是六个月就自动生效离婚'
#test_cut_raw_1 = util_words_cut.get_class_words_list(test_data_1)
test_cut_raw_1 = list(jieba.cut(test_data_1))
print(test_cut_raw_1)
inferred_vector = model.infer_vector(test_cut_raw_1)
print(inferred_vector)
sims = model.docvecs.most_similar([inferred_vector], topn=3)
print(sims)
Code Example #23
train_corpus[2]

## Each item is a TaggedDocument(list of word tokens, [integer tag])

## Model Training

%%time
from gensim.models import Doc2Vec
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=100)
model.build_vocab(train_corpus) 
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

models = [
    # PV-DBOW (Skip-Gram equivalent of Word2Vec)
    Doc2Vec(dm=0, dbow_words=1, vector_size=200, window=8, min_count=10, epochs=50),
    
    # PV-DM w/average (CBOW equivalent of Word2Vec)
    Doc2Vec(dm=1, dm_mean=1, vector_size=200, window=8, min_count=10, epochs =50),
]

## Concatenated Model

## Train both PV-DBOW and PV-DM and combine the two

documents = train_corpus
models[0].build_vocab(documents)
models[1].reset_from(models[0])

for model in models:
    model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
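
One way to actually combine the two trained models, as in gensim's doc2vec-IMDB tutorial, is the ConcatenatedDoc2Vec wrapper. It lives in gensim's test utilities, so treat this as a sketch rather than stable API:

from gensim.test.test_doc2vec import ConcatenatedDoc2Vec

combined_model = ConcatenatedDoc2Vec([models[0], models[1]])
# combined_model.docvecs[tag] now returns the PV-DBOW and PV-DM vectors
# concatenated into a single 400-dimensional vector.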
Code Example #24
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')
p_stemmer = PorterStemmer()
en_stop = get_stop_words('en')
tokenizer = RegexpTokenizer(r'\w+')
# This project is a small sample, applying doc2vec to a small data set (1 million instances)
# Main idea of doc2vec: convert a list of words into a fixed-length vector,
# then compare the similarity of two sentences by comparing the distance between their vectors

# 1/ Create a model for the clustering problem
# **Note: the parameters of this model must be tuned by the user
# model=Doc2Vec(dm=1,dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2)
model = Doc2Vec(alpha=0.025, min_alpha=0.025)

# 2/ Create a data set for training (a minimal sketch follows below)
# - Use all questions (the question1 and question2 attributes) from both the train and test files as training data
# - Each question is a separate "doc" that needs to be converted into a "vec" in some vector space

# sent_raw="What is the step by step guide to invest in share market in india?"
# # model.build_vocab(sentences=[LabeledSentence()])
# sent=LabeledSentence(words=sent_raw.split(" "),tags=[1])
# model.build_vocab(sentences=[sent])
# print(model.vocab)
# model.train(sentences=[sent])
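
A minimal sketch of step 2/ above; it assumes pandas DataFrames with 'question1' and 'question2' columns (as in the Quora train.csv / test.csv) and tags each question with a running integer:

from gensim.models.doc2vec import TaggedDocument

def build_training_docs(train_df, test_df):
    # Hypothetical helper: one TaggedDocument per question, across both files.
    docs = []
    tag = 0
    for df in (train_df, test_df):
        for column in ('question1', 'question2'):
            for text in df[column].fillna(''):
                docs.append(TaggedDocument(words=text.split(" "), tags=[tag]))
                tag += 1
    return docs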


def read_data():
    df = pd.read_csv("train.csv", encoding='utf8')
Code Example #25
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing
import time

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

simple_models = [
    # PV-DBOW
    Doc2Vec(dm=0,
            vector_size=100,
            min_count=3,
            window=10,
            negative=5,
            hs=0,
            workers=cores),
    # PV-DM
    Doc2Vec(dm=1,
            vector_size=300,
            window=10,
            negative=5,
            hs=0,
            min_count=2,
            workers=cores),
]

model = simple_models[0]
model.build_vocab(docData, update=False)
Code Example #26
for file in files:
    review = ''
    with open('aclImdb/test/neg/{}'.format(file), 'r', encoding='utf-8') as f:
        for word in word_tokenize(f.read()):
            if lemm.lemmatize(word) not in stop_words:
                review += ' ' + word
        f.close()
    review_list.append(review)
    labels_list.append('neg_' + file)

it = LabeledLineSentence(doc_list=review_list, labels_list=labels_list)

model = Doc2Vec(size=3000,
                window=10,
                dm=0,
                alpha=0.025,
                min_alpha=0.025,
                min_count=5,
                workers=multiprocessing.cpu_count())

model.build_vocab(it)
model.train(it, total_examples=4000, epochs=20)

model.save('partial_Doc2Vec.model')

model = Doc2Vec.load('partial_Doc2Vec.model')

x_train = np.zeros((2000, 3000))
y_train = np.zeros(2000)

files = os.listdir('aclImdb/train/pos')[:1000]
Code Example #27
def convert_sentences(sentence_list):
    for i in range(len(sentence_list)):
        for char in ['.', ',', '!', '?', ';', ':']:
            sentence_list[i] = sentence_list[i].replace(char, ' ' + char + ' ')
    return [TaggedDocument(words=sentence_list[i].split(), tags=[i]) for i in range(len(sentence_list))]

def normalize(x,p=2):
    xx = np.linalg.norm(x, p)
    return x / xx if xx else x


cores = multiprocessing.cpu_count()

content = ["A head motion during brain imaging has been recognized as a source of image degradation and introduces distortion in positron emission tomography (PET) image. There are several techniques to correct the motion artifact, but these techniques cannot correct the motion during scanning. The aim of this study is to develop a sinogram-based motion correction (SBMC) method to correct directly the head motion during PET scanning using a motion tracking system and list-mode data acquisition. This method is a rebinning procedure by which the lines of response (LOR) are geometrically transformed according to the current values of the six-dimensional motion data. Michelogram was recomposed using rebinned LOR and motion corrected sinogram was generated. In the motion corrected image, the blurring artifact due to motion was reduced by SBMC method.", "A hierarchical controller for dealing with faults and adverse environmental conditions on an automated highway system is proposed. The controller extends a previous control hierarchy designed to work under normal conditions of operation. The faults are classified according to the capabilities remaining on the vehicle or roadside after the fault has occurred. Information about these capabilities is used by supervisors in each of the layers of the hierarchy to select appropriate fault handling strategies. We outline the strategies needed by the supervisors and give examples of their detailed operation", "A highly dependable embedded fault-tolerant memory architecture for high performance massively parallel computing applications and its dependability assurance techniques are proposed and discussed in this paper. The proposed fault tolerant memory provides two distinctive repair mechanisms: the permanent laser redundancy reconfiguration during the wafer probe stage in the factory to enhance its manufacturing yield and the dynamic BIST/BISD/BISR (built-in-self-test-diagnosis-repair)-based reconfiguration of the redundant resources in field to maintain high field reliability. The system reliability which is mainly determined by hardware configuration demanded by software and field reconfiguration/repair utilizing unused processor and memory modules is referred to as HW/SW Co-reliability. Various system configuration options in terms of parallel processing unit size and processor/memory intensity are also introduced and their HW/SW Co-reliability characteristics are discussed. A modeling and assurance technique for HW/SW Co-reliability with emphasis on the dependability assurance techniques based on combinatorial modeling suitable for the proposed memory design is developed and validated by extensive parametric simulations. Thereby, design and Implementation of memory-reliability-optimized and highly reliable fault-tolerant field reconfigurable massively parallel computing systems can be achieved.","A highly efficient color correction approach based on color-encoded fringe projection is proposed, which combine color image segmentation and color intensity interpolation technique. Only 24 designed color patterns are projected and recorded to implement the process with a high brightness DLP projector and a color camera. 
To establish the correspondence between the designed color intensity and recorded color intensity, the recorded image is firstly segmented into some adjacent grid region by neighboring pixel intensity fitting error, the grid region is then grown to the region boundary employing some process algorithm, thirdly, the region number is labeled and adjusted based on the designed color pattern by searching the region centre coordinate and applying a man-machine conversation method, finally, the color correspondence relation is established according to the designed color pattern pixel index and the labeled grid region number of recorded image. While doing the color correction, firstly, the initial color intensity is searched according to the minimum color distance between the recorded color and designed color. Secondly, color interpolation is implemented to obtain the true color intensity correspondence to recorded color. The proposed approach validity is testified by experiment results.","A high-performance line conditioner with excellent efficiency and power factor is proposed. The line conditioner consists of a three-leg rectifier-inverter, which operates as a boost converter and a buck converter. This boost-buck topology enables constant output voltage regulation, irrespective of input voltage disturbances. In addition the three-leg bridge can reduce the number of switching devices and system loss, while maintaining the capabilities of power factor correction and good output voltage regulation. The power factor controller for the single-phase pulse-width modulated (PWM) rectifier is derived using the feedback linearisation concept. The inverter side acts as a voltage regulator with current-limiting capability for impulsive loads. The disturbance of input voltage is detected using a fast-sensing technique. Experimental results obtained on a 3 kVA prototype show a normal efficiency of over 95% and input power factor of over 99%.","A high-tech information electronic equipment of some given type is designed in order to proceed automatically fault detection and improve the efficiency and accuracy of diagnosis. This thesis which is a part of the program introduces the research of algorithm of fault diagnose expert system of a power supply circuit board of an electronic device and algorithm realization and example proving on the hardware platform. It's quicker and more convenient to locate fault on the circuit boards with this equipment. It's proved that this expert system can solve the problems of high cost and long intervals of maintenance and keep the equipment in a stable status", "A hydrogen-powered fuel cell vehicle is developed, in which a distributed control and communication system based on CAN (Controller Area Network) is built. For vehicle diagnostic purpose, a new on-board fault diagnosis strategy is presented. There are two efficient automotive diagnostic systems based on CAN designed and implemented in this paper: (1)CANoe is a powerful CAN development tool. A fault diagnosis environment based on CANoe is established to satisfy the needs of on-board and off-board fault diagnosis application of FCV. By setting up the communication interface between CANoe and Access, the vehicle fault codes are collected and stored. Meanwhile a database is designed for the management of fault information. (2) A hand-held fault diagnosis equipment as well as a windows analyzer interface is set up. All fault information from FCVpsilas CAN network can be gotten easily by the equipment. 
With the Serial Communication between the equipment and PC, the fault codes stored in the equipment can be read, analyzed and disposed by PC.", "A key attribute of any tester for FLIR systems is a calibrated uniform source. A uniform source ensures that any anomalies in performance are artifacts of the FLIR being tested and not the tester. Achieving a uniform source from a resistor array based portable infrared scene projector requires implementation of nonuniformity correction algorithms instead of controlling the bonding integrity of a source to a cooler, and the coating properties of the source typical of a conventional blackbody. The necessity to perform the non-uniformity correction on the scene projector is because the source is a two-dimensional array comprised of discrete resistive emitters. Ideally, each emitter of the array would have the same resistance and thus produce the same output for a given drive current. However, there are small variations from emitter to emitter over the thousands of emitters that comprise an array. Once a uniform output is achieved then the output must be calibrated for the system to be used as test equipment. Since the radiance emitted from the monolithic array is created by flowing current through micro resistors, a radiometric approach is used to calibrate the differential output of the scene projector over its dynamic range. The focus of this paper is to describe the approach and results of implementing non-uniformity correction and calibration on a portable infrared scene projector.", "A kind of routing scheme with the ability to tolerate the faults is necessary in the massively parallel multiprocessors. In this paper, we have proposed a kind of fault-tolerant routing scheme in the tori networks. The new routing scheme is called the two-level-turn-model routing scheme, which is based on our investigation of the fault-tolerant properties of the turn-model. Through employing two specific kinds of turn model, our routing scheme could tolerate the convex faults and the concave faults both with a few limitations to their shape. At most five virtual channels would be used to avoid the deadlock occurrence in the tori, no matter whether the fault regions are connected and no matter where the faults locate. Actually, if the fault regions encompass no physical boundary nodes in the tori, totally four virtual channels, each pair for each turn model, would be sufficient to preclude the occurrence of the deadlock. At last, the simulation shows the effectiveness of our scheme."]
content1 = convert_sentences(content)
model = Doc2Vec(size=10, window=10, min_count=5, workers=cores,alpha=0.025, min_alpha=0.025)

model.build_vocab(content1)


for epoch in range(10):
    model.train(content1, total_examples=model.corpus_count, epochs=1000)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay
# set_trace()  # debugger breakpoint (requires: from pdb import set_trace)
while True:
    x = model.infer_vector(content1[0].words, alpha=model.alpha, min_alpha=model.min_alpha)
    y = model.infer_vector(content1[0].words, alpha=model.alpha, min_alpha=model.min_alpha)
    x = normalize(x)
    y = normalize(y)
Code Example #28
    def __init__(self, model='doc2v', **params):
        self.embedding = model
        if self.embedding == 'doc2v':
            self.fn_model = Doc2Vec(**params)
Code Example #29
    return tokens


train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['Desc']), tags=[r.Code]),
    axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['Desc']), tags=[r.Code]),
    axis=1)

print(train_tagged.values[30])

model_dmm = Doc2Vec(dm=1,
                    dm_mean=1,
                    window=10,
                    negative=5,
                    min_count=1,
                    workers=5,
                    alpha=0.065,
                    min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])

# for epoch in range(30):
#     model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values),
#                     epochs=1)
#     model_dmm.alpha -= 0.002
#     model_dmm.min_alpha = model_dmm.alpha

print("Load modello doc2vec")
model_dmm = Doc2Vec.load("0Model_Dmm.bin")

y_train, X_train = vec_for_learning(model_dmm, train_tagged)
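
The vec_for_learning helper is not part of this excerpt; a common version (inferring one vector per tagged document and returning targets and features) looks roughly like this:

def vec_for_learning(model, tagged_docs):
    # tagged_docs is a pandas Series of TaggedDocument objects
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words))
                                for doc in sents])
    return targets, regressors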
Code Example #30
#This function does all cleaning of data using two objects above
def nlp_clean(data):
    new_str = data.lower()
    dlist = tokenizer.tokenize(new_str)
    dlist = list(set(dlist).difference(stopword_set))
    return dlist

class LabeledLineSentence(object):
    def __init__(self, filename1,filename2):
        self.filename1 = filename1
        self.filename2 = filename2
    def __iter__(self):
        for uid, (line1,line2) in enumerate(zip(open(self.filename1),open(self.filename2))):
            yield gensim.models.doc2vec.LabeledSentence(nlp_clean(line1),line2.split())

sentences = LabeledLineSentence('/home/eric/Data/reviewlist','/home/eric/Data/idlist')
import gensim
from gensim.models import Doc2Vec
import multiprocessing
import os
import logging

logging.basicConfig(level=logging.INFO)
print(os.sched_getaffinity(0))
os.sched_setaffinity(0, range(4))

assert gensim.models.doc2vec.FAST_VERSION > -1

model = Doc2Vec(sentences, size=300, min_count=10, alpha=0.025, min_alpha=0.001, workers=multiprocessing.cpu_count(),iter = 15)  

model.save("/home/eric/Data/doc2vec.model")