Code Example #1
File: gloveTrain.py  Project: AshkenSC/Python-Gadgets
from glove import Glove
from glove import Corpus

'''Load the dataset'''
# Store each line of the text file as a list of tokens
lines = []
with open('clean_data.txt', encoding='utf-8') as f:
    for i, line in enumerate(f, start=1):
        lines.append(line.split(' '))
        print("appending line " + str(i))

# Prepare the dataset
corpus_model = Corpus()
corpus_model.fit(lines, window=10)
#corpus_model.save('corpus.model')
print('Dictionary size: %s' % len(corpus_model.dictionary))
print('Collocations: %s' % corpus_model.matrix.nnz)


'''Train the model'''
gl = Glove(no_components=200, learning_rate=0.05)
gl.fit(corpus_model.matrix, epochs=5,
          no_threads=1, verbose=True)
gl.add_dictionary(corpus_model.dictionary)


'''Save the model'''
gl.save('glove.model')
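A usage note, not part of the original script: once saved, the model can be reloaded and queried with the same glove-python calls that appear in later examples; a minimal sketch, with a placeholder query word:

# Minimal sketch: reload the saved model and query nearest neighbours.
gl = Glove.load('glove.model')
query_word = 'example'  # placeholder; use any token present in corpus_model.dictionary
print(gl.most_similar(query_word, number=10))  # list of (word, similarity) pairs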

Code Example #2
def train_glove(inst, meta_data={}):

    start_total = datetime.now()

    meta_data["glove_params"] = settings.GLOVE_PARAMS

    glove_paramgrid = ParameterGrid(settings.GLOVE_PARAMS)

    for params in glove_paramgrid:

        start = datetime.now()
        # MAKE CORPUS
        # set corpus filepath
        corpus_fp = os.path.join(settings.WVEC_OPT_DIRP, '{}_window{}.glovecorpus'.format(
            settings.DATASET,
            params["window"]))
        # load if corpus exists
        if os.path.isfile(corpus_fp):
            logging.info("Loading existing corpus {}.".format(corpus_fp))
            corpus_model = Corpus.load(corpus_fp)
            logging.info("Successfully loaded existing corpus {}.".format(corpus_fp))
        # make a new cooccurrence corpus if it does not exist
        else:
            logging.info("Creating new corpus at {}.".format(corpus_fp))
            corpus_model = Corpus()
            corpus_model.fit(inst, window=params["window"])
            os.makedirs(settings.WVEC_OPT_DIRP, exist_ok=True)
            corpus_model.save(corpus_fp)

        logging.info("Dict size: {}.".format(len(corpus_model.dictionary)))
        logging.info("Collocations: {}.".format(corpus_model.matrix.nnz))

        # GLOVE VECTOR TRAINING
        glove = Glove(no_components=params["dims"], learning_rate=params["lr"])

        logging.info("Start fitting GloVe with parameters: {}.".format(params))
        glove.fit(corpus_model.matrix, epochs=params["epochs"],
                  no_threads=params["njobs"], verbose=False)
        glove.add_dictionary(corpus_model.dictionary)

        os.makedirs(settings.WVEC_OPT_DIRP, exist_ok=True)
        model_name = 'glove.{}_w{}_lr{}_ep{}.{}d.glovemodel'.format(settings.DATASET,
                                                                    params["window"],
                                                                    params["lr"],
                                                                    params["epochs"],
                                                                    params["dims"])
        glove.save(os.path.join(settings.WVEC_OPT_DIRP, model_name))

        duration = (datetime.now() - start).total_seconds()
        meta_data["models"][model_name] = params
        meta_data["models"][model_name]["duration_training"] = duration

        logging.info("Finished fitting GloVe {} in {}s with parameters: {}.".format(
            model_name,
            duration,
            params))
        # SIMILARITY TEST
        for test_word in settings.TESTSIM_WORDS:
            if test_word not in meta_data["most_similar"]:
                meta_data["most_similar"][test_word] = {}

            logging.info("Querying model {} for {} most similar to \'{}\':".format(
                model_name,
                settings.N_TESTSIM,
                test_word))
            sim = glove.most_similar(test_word, number=settings.N_TESTSIM)
            meta_data["most_similar"][test_word][model_name] = sim

            logging.info(pprint.pformat(sim))

    total_duration = (datetime.now() - start_total).total_seconds()
    meta_data["glove_duration_training"] = total_duration

    return meta_data
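The grid above is driven entirely by settings.GLOVE_PARAMS, which is not shown in this snippet. As an illustration only (the real values are unknown), ParameterGrid — presumably sklearn.model_selection.ParameterGrid — expects a dict mapping each parameter name used above to a list of candidate values:

# Hypothetical example of the expected shape of settings.GLOVE_PARAMS.
GLOVE_PARAMS = {
    "window": [5, 10],
    "dims": [100, 300],
    "lr": [0.05],
    "epochs": [10, 30],
    "njobs": [4],
}
# ParameterGrid(GLOVE_PARAMS) then yields one dict per combination, e.g.
# {"window": 5, "dims": 100, "lr": 0.05, "epochs": 10, "njobs": 4}.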
Code Example #3
def main(model_select):
    data = pd.read_excel("./data/doc_set_final4.xlsx")
    data.token = data.token.apply(lambda x: literal_eval(x))
    data = data.sample(frac=1, random_state=1234)

    token_list = data.token.tolist()
    target = data[['new_class', 'new_small_class']]
    train_x_data, test_x_data, train_y, test_y = train_test_split(
        token_list,
        target,
        test_size=0.3,
        stratify=target,
        shuffle=True,
        random_state=1234)

    if model_select == 'w2v':
        w2v_name = 'base_token'
        print("모델 학습")
        word2vec_kargs = {
            'num_features': 300,
            'num_workers': 4,
            'window': 8,
            'seed': 1234,
            'min_word_count': 5,
            'min_alpha': 0.025,
            'iter': 30
        }
        model = word2vec_model(train_x_data, **word2vec_kargs)
        print("모델 저장")
        model_name = './model/word_embedding/Word2vec1({}).model'.format(
            w2v_name)
        model.save(model_name)

    elif model_select == 'd2v':
        TaggedDocument = namedtuple('TaggedDocument', 'words tags')
        tagged_train_docs = [
            TaggedDocument(d, [c[1]['new_class'], c[1]['new_small_class']])
            for d, c in zip(train_x_data, train_y.iterrows())
        ]
        print("모델 학습")
        doc2vec_kargs = {
            'size': 300,
            'window': 8,
            'min_count': 5,
            'alpha': 0.025,
            'min_alpha': 0.025,
            'workers': 4,
            'seed': 1234,
            'iter': 50
        }
        model = doc2vec_model(tagged_train_docs, **doc2vec_kargs)
        print("모델 저장")
        model.save('./model/word_embedding/Doc2vec_new_small2_4.model')

    elif model_select == 'fasttext':
        print("모델 학습")
        ft_kargs = {
            'size': 300,
            'window': 5,
            'min_count': 3,
            'workers': 4,
            'seed': 1234
        }
        model = fasttext_model(train_x_data, **ft_kargs)
        print("모델 저장")
        model.save('./model/word_embedding/FastText.model')

    elif model_select == 'glove':
        glove_kargs = {
            'size': 300,
            'lr': 0.005,
            'random_state': 1234,
            'no_threads': 4,
            'epoch': 30
        }
        corpus = Corpus()
        corpus.fit(train_x_data, window=8)
        glove = Glove(no_components=glove_kargs['size'],
                      learning_rate=glove_kargs['lr'])
        glove.fit(corpus.matrix,
                  epochs=glove_kargs['epoch'],
                  no_threads=glove_kargs['no_threads'],
                  verbose=True)
        glove.add_dictionary(corpus.dictionary)
        print("모델 저장")
        glove.save('./model/word_embedding/glove.model')
    else:
        print("3가지 방식 중에 고르시오")
Code Example #4
    for sublist in description:
        for item in sublist:
            desc_text.append(item)
    """        
   
    print(len(fulltext))
    print(len(fulltext[0]))
    print(len(fulltext[1]))
  
    
    
    corpus = Corpus()
    desc = Corpus()
    corpus.fit(fulltext, window=10)  # length of the (symmetric) context window used for cooccurrence
    desc.fit(description, window=10)
    desc_glove = Glove(no_components=100, learning_rate=0.05)
    desc_glove.fit(desc.matrix, epochs=30, no_threads=4, verbose=True)
    desc_glove.add_dictionary(desc.dictionary)
    desc_glove.save('/Volumes/Untitled/WithDescription/CS/desc_glove.tsv')
    glove = Glove(no_components=390, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    #tsne = TSNE(n_components=2, verbose=1,perplexity=2,method='exact')
    #tsne_results = tsne.fit_transform(desc_glove.word_vectors)
    print(corpus.dictionary)
    content_vector=glove.word_vectors  #vector with word embeddings

    with open("/Volumes/Untitled/WithDescription/CS/content.tsv","w+") as my_csv:
       
         csvWriter = csv.writer(my_csv,delimiter=' ')
         csvWriter.writerows(content_vector)
Code Example #5
# In[19]:

corpus = Corpus()

# In[20]:

corpus.fit(sentence_corpus, window=10)

# In[21]:

import sys

# In[23]:

glove = Glove(no_components=300, learning_rate=0.01)

# In[24]:

glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)

# In[25]:

glove.add_dictionary(corpus.dictionary)

# In[26]:

len(glove.dictionary)

# In[29]:
Code Example #6
File: train_deep.py  Project: iwhitey/Bot-Detection
    # tokenized = ['<rt>' if token == 'rt' else token for token in tokenized] ##add <rt> token since it exists in glove
    #tokenized = [token for token in tokenized if not token in stopwords.words('english')] ##remove stopwords
    #print(tokenized)
    #tokenized = list(map(lambda x : lemmatizer.lemmatize(x), tokenized))

lr = 1e-4
num_epoch = 9
batch_size_train = 64
batch_size_test = 32
gradient_clip = 0.25
torch.manual_seed(7052020)
np.random.seed(7052020)

########################## choose embeddings ##########################
embedding_source = 'glove'
embeddings = Glove(glove_path).glove_dict
# embedding_source = 'bert'
# embeddings = Path(bert_path_train)
_time = time()

# bert_dict_train = None
# with open(Path(bert_path_train), 'r') as f:
#     bert_dict_train = json.load(f)
# print("Initialized train embeddings!")

# bert_dict_test = None
# with open(Path(bert_path_test), 'r') as f:
#     bert_dict_test = json.load(f)
# print("Initialized test embeddings!")

train_dataset = DatasetPyTorch(
Code Example #7
train = idx[:split]
test = idx[split:]

X_train = data['review'].values[train]

del data
gc.collect()

preprocessor = get_preprocessor(stem=True, stop=True, min_length=3)
folder = 'dump'
os.makedirs(folder, exist_ok=True)

file = 'setup.npz'
if file in os.listdir(folder):
    logger(f'found saved {file}')
    glove = Glove(None, preprocessor, random_state=2021)
    glove.load(f'{folder}/{file}')
else:
    glove = Glove(X_train, preprocessor, random_state=2021)
    glove.dump_co_occurance(f'{folder}/{file}')
    logger(f'saved {file}')

del X_train
gc.collect()

print()
for dim in [2, 10, 50, 100, 200, 300, 400, 500, 600]:
    # Needs more iterations to converge for higher dims
    glove.fit(dim,
              eta=1e-2,
              epochs=200 if dim < 300 else 1000,
Code Example #8
    print('[{}] Reading corpus from file...'.format(chalk.yellow(CORPUS_FILE)))
    corpus = Corpus.load(CORPUS_FILE)
else:
    nx_G = util.get_nx_graph()
    walks = util.get_node2vec_walks(nx_G)
    corpus = Corpus()
    corpus.fit(walks, window=WINDOW_SIZE)
    print('[{}] Writing corpus file...'.format(chalk.green(CORPUS_FILE)))
    corpus.save(CORPUS_FILE)

if os.path.exists(GLOVE_MODEL_FILE) and not args.train:
    print('[{}] Reading glove model from file...'.format(
        chalk.yellow(GLOVE_MODEL_FILE)))
    glove = Glove.load(GLOVE_MODEL_FILE)
else:
    glove = Glove(no_components=VECTOR_DIMENSION, learning_rate=0.05)
    glove.fit(corpus.matrix,
              epochs=GLOVE_EPOCHS,
              no_threads=PARALLEL_WORKER_COUNT,
              verbose=True)
    glove.add_dictionary(corpus.dictionary)
    print('[{}] Writing glove file...'.format(chalk.green(GLOVE_MODEL_FILE)))
    glove.save(GLOVE_MODEL_FILE)
if args.query:
    dictionary = glove.dictionary
    print(glove.word_vectors[glove.dictionary[args.query]])
    print(glove.most_similar(args.query, number=40))


def get_glove_model():
    return glove
Code Example #9
def corpus_to_glove(corpus):
    glove_model = Glove(no_components=50, learning_rate=0.07)
    glove_model.fit(corpus.matrix, epochs=1, no_threads=2)
    glove_model.add_dictionary(corpus.dictionary)
    return glove_model
Code Example #10
def generateModel(traces, runID):
    linesentencedataKey = ""
    linesentencetimeKey = ""
    linesentencenumberKey = ""
    linesentencebiburstDataKey = ""
    linesentencebiburstTimeKey = ""
    linesentencePackLen = ""
    sentencesFile = "model/" + runID + "sentences.txt"
    modelFile = "model/" + runID + "mygloveModel"
    myFile = open(sentencesFile, 'w')
    mypackCount = 0
    for trace in traces:
        linesentencedataKey = ""
        linesentencetimeKey = ""
        linesentencenumberKey = ""
        linesentencebiburstDataKey = ""
        linesentencebiburstTimeKey = ""
        linesentencePackLen = ""

        directionCursor = None
        dataCursor = 0
        timeCursor = 0
        prevTimeCursor = 0
        burstTimeRef = 0
        numberCursor = 0

        secondBurstAndUp = False
        prevDataCursor = 0
        prevDirectionCursor = None

        for packet in trace.getPackets():
            if directionCursor == None:
                directionCursor = packet.getDirection()

            if packet.getDirection() != directionCursor:
                dataKey = 'S' + str(directionCursor) + '-' + str(
                    GloveClassifier.roundArbitrary(dataCursor, 600))
                #dataKey = 'S'+str(directionCursor)+'-'+str(dataCursor)

                if config.GLOVE_OPTIONS['burstSize'] == 1:
                    linesentencedataKey = linesentencedataKey + " " + dataKey
                #directionCursor = packet.getDirection()
                #dataCursor      = 0

                timeKey = 'T' + str(directionCursor) + '-' + str(timeCursor)

                if config.GLOVE_OPTIONS['burstTime'] == 1:
                    linesentencetimeKey = linesentencetimeKey + " " + timeKey
                burstTimeRef = packet.getTime()

                # number marker
                numberKey = 'N' + str(directionCursor) + '-' + str(
                    numberCursor)
                if config.GLOVE_OPTIONS['burstNumber'] == 1:
                    linesentencenumberKey = linesentencenumberKey + " " + numberKey
                numberCursor = 0

                # BiBurst
                if secondBurstAndUp:
                    biBurstDataKey = 'Bi-'+str(prevDirectionCursor)+'-'+str(directionCursor)+'-'+ \
                                     str( GloveClassifier.roundArbitrary(prevDataCursor, 600) )+'-'+ \
                                     str( GloveClassifier.roundArbitrary(dataCursor, 600) )

                    if config.GLOVE_OPTIONS['biBurstSize'] == 1:
                        linesentencebiburstDataKey = linesentencebiburstDataKey + " " + biBurstDataKey


                    biBurstTimeKey = 'BiTime-'+str(prevDirectionCursor)+'-'+str(directionCursor)+'-'+ \
                                     str( prevTimeCursor )+'-'+ \
                                     str( timeCursor )

                    if config.GLOVE_OPTIONS['biBurstTime'] == 1:
                        linesentencebiburstTimeKey = linesentencebiburstTimeKey + " " + biBurstTimeKey

                prevTimeCursor = timeCursor
                timeCursor = 0

                secondBurstAndUp = True
                prevDataCursor = dataCursor
                dataCursor = 0
                prevDirectionCursor = directionCursor
                directionCursor = packet.getDirection()

            dataCursor += packet.getLength()
            timeCursor = packet.getTime() - burstTimeRef
            numberCursor += 1

            if config.GLOVE_OPTIONS['packetSize'] == 1:
                linesentencePackLen = linesentencePackLen + " " + str(
                    packet.getLength()) + "_" + str(packet.getDirection())

        if dataCursor > 0:
            #key = 'S'+str(directionCursor)+'-'+str( dataCursor)
            key = 'S' + str(directionCursor) + '-' + str(
                GloveClassifier.roundArbitrary(dataCursor, 600))
            if config.GLOVE_OPTIONS['burstSize'] == 1:
                linesentencedataKey = linesentencedataKey + " " + key

            timeKey = 'T' + str(directionCursor) + '-' + str(timeCursor)
            if config.GLOVE_OPTIONS['burstTime'] == 1:
                linesentencetimeKey = linesentencetimeKey + " " + timeKey

            numberKey = 'N' + str(directionCursor) + '-' + str(numberCursor)
            if config.GLOVE_OPTIONS['burstNumber'] == 1:
                linesentencenumberKey = linesentencenumberKey + " " + numberKey

            # BiBurst
            if secondBurstAndUp:
                #biBurstDataKey = 'Bi-'+str(prevDirectionCursor)+'-'+str(directionCursor)+'-'+ \
                #                 str( prevDataCursor )+'-'+ \
                #                 str( dataCursor )
                biBurstDataKey = 'Bi-'+str(prevDirectionCursor)+'-'+str(directionCursor)+'-'+ \
                                 str( GloveClassifier.roundArbitrary(prevDataCursor, 600) )+'-'+ \
                                 str( GloveClassifier.roundArbitrary(dataCursor, 600) )

                if config.GLOVE_OPTIONS['biBurstSize'] == 1:
                    linesentencebiburstDataKey = linesentencebiburstDataKey + " " + biBurstDataKey


                biBurstTimeKey = 'BiTime-'+str(prevDirectionCursor)+'-'+str(directionCursor)+'-'+ \
                                 str( prevTimeCursor )+'-'+ \
                                 str( timeCursor )

                if config.GLOVE_OPTIONS['biBurstTime'] == 1:
                    linesentencebiburstTimeKey = linesentencebiburstTimeKey + " " + biBurstTimeKey

        myFile.write(linesentencePackLen + linesentencedataKey +
                     linesentencetimeKey + linesentencenumberKey +
                     linesentencebiburstDataKey + linesentencebiburstTimeKey)
        myFile.write("\n")

    myFile.close()
    if config.CLASSIFIER == config.GLOVE_CLASSIFIER:
        sentences = models.word2vec.LineSentence(sentencesFile)
        corpus = Corpus()

        corpus.fit(sentences, window=config.GLOVE_PARAMETERS['window'])
        glove = Glove(no_components=config.GLOVE_PARAMETERS['no_components'],
                      learning_rate=config.GLOVE_PARAMETERS['learning_rate'])
        glove.fit(corpus.matrix,
                  epochs=config.GLOVE_PARAMETERS['epochs'],
                  no_threads=10,
                  verbose=False)
        glove.add_dictionary(corpus.dictionary)
        glove.save(modelFile)

    elif config.CLASSIFIER == config.W2V_CLASSIFIER:
        txt = open(sentencesFile)
        # print txt.read()
        if (len(txt.read()) > 0):
            #print "in here"
            txt.close()
            sentences = models.word2vec.LineSentence(sentencesFile)
            model = models.word2vec.Word2Vec(sentences,
                                             size=50,
                                             window=15,
                                             min_count=1,
                                             workers=4)
            model.save("word2vecModel")
        txt.close()
Code Example #11
try:
    with open('text8') as f:
        words = f.read()
except:
    msg = 'Missing "text8" file!\nTry "wget https://data.deepai.org/text8.zip" and unzipping text8.zip then retrying this script!'
    raise Exception(msg)

def preprocessor(text, to_tokens = False):
    if to_tokens:
        return text.split()
    return [text.split()]

file = 'setup.npz'
if file in os.listdir(folder):
    logger(f'found saved {file}')
    glove  = Glove(None   , preprocessor, random_state = 2021, x_min = 2, x_max = 20)
    glove.load(f'{folder}/{file}')
else:
    start  = process_time()
    glove  = Glove([words], preprocessor, random_state = 2021, x_min = 2, x_max = 20)
    time   = process_time() - start
    glove.dump_co_occurance(f'{folder}/{file}', time = time)

del words; gc.collect()

print()
for dim in [2, 10, 50, 100, 200, 300, 400, 500, 600]:
    filename = f'glove-{dim}.npz'
    start    = process_time()
    glove.fit(dim, eta = 0.5, epochs = 500, optimiser = 'adam', decay = 1e-2)
    time     = process_time() - start
Code Example #12
  def train(self, epochs=30,no_threads=None):
    """
    Train with own Data(s)
    Supports a single corpus, multiple corpora, or a dataframe column.
    Parameters:
    -----------
    model_name(optional): preferred model name
    epochs : int : total epochs for training
    no_threads(optional): int : number of threads for training

    Example
    --------
    >>> from ekushey.feature_extraction import BN_GloVe

    #Training Against Sentences
    >>> glv = BN_GloVe(sentences=[['আমার', 'প্রিয়', 'জন্মভূমি'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'] ])
    >>> glv.train()

    #Training Against one Text Corpus
    >>> glv = BN_GloVe(corpus_file="path_to_corpus.txt")
    >>> glv.train()

    #Training Against Multiple Corpora
    path
      ->corpus
        ->1.txt
        ->2.txt
        ->3.txt

    >>> glv = BN_GloVe(corpus_path="path/corpus")
    >>> glv.train(epochs=25)

    #Training Against a Dataframe Column

    >>> glv = BN_GloVe(df= news_data['text_content'])
    >>> glv.train(epochs=25)



    """
    if not self.sentences and not self.corpus_file and not self.corpus_path and self.df is None:
      raise Exception('Data is not given')
    elif self.sentences:
      data = self.sentences
      print("got sentences")
    elif self.corpus_file:
      print("got corpus file")
      data = PathLineSentences(self.corpus_file)
    elif self.corpus_path:
      print("got corpus path")
      data = PathLineSentences(self.corpus_path)
    elif self.df is not None:
      print("got dataframe")
      data = '\n'.join(self.df)
      data = data.split('\n')
      data = [sent.split() for sent in data]
    else:
      print("Unexpected error occurred: please check your data file again.")

    
    
    if no_threads is None:
      no_threads = self.cpu_cores

    t = time()
    corpus = Corpus()
    corpus.fit(data, window=self.window)
    print('Dict size: %s' % len(corpus.dictionary))
    glove = Glove(no_components=self.size, learning_rate=self.n)
    glove.fit(corpus.matrix, epochs=epochs, no_threads=no_threads, verbose=True)
    print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
    glove.add_dictionary(corpus.dictionary)
    print("Saving model to current directory")
    glove.save(self.model_name)
Code Example #13
import settings
from protocolparser import ProtocolParser
from glove import Glove
from datetime import datetime

test_time = 1000 / (10 * settings.polling_rate) * 1000000  # Microseconds

# Open up the connection
serverMACAddress = settings.MAC_Juuso
port = 3
s = socket.socket(socket.AF_BLUETOOTH, socket.SOCK_STREAM,
                  socket.BTPROTO_RFCOMM)
s.connect((serverMACAddress, port))

# Make a virtual glove and parser
glove = Glove(always_send_all_sensor_data=settings.glove_sends_all_data)
parser = ProtocolParser()
parser.init_send(2)

start_time = datetime.now()
last_poll_time = datetime.now()
last_movement_time = datetime.now()

hz_in_microsecs = 1 / (settings.polling_rate * 1000 * 1000)
test_ticks = 10000  # The amount of times our finger changes physical positions

# Run loop for the duration of the test
while (last_poll_time - start_time).microseconds < test_time:
    # Increment finger position if needed
    if (datetime.now() -
            last_movement_time).microseconds > test_time / test_ticks:
Code Example #14
start_time = time.time()
sentences = []

for word in vocabulary:
    sentences.extend(randomNWalkUniform(triples, word, walks, path_depth))

elapsed_time = time.time() - start_time
print('Time elapsed to generate features:',
      time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

#de models

#GloVe
corpus = Corpus()
corpus.fit(sentences, window=10)
glove_500 = Glove(no_components=10, learning_rate=0.05)
glove_500.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove_500.add_dictionary(corpus.dictionary)
glove_500.save('glove_10.model')

#GloVe
glove_200 = Glove(no_components=15, learning_rate=0.05)
glove_200.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove_200.add_dictionary(corpus.dictionary)
glove_200.save('glove_15.model')

#fasttext 500
print("start fast 10")
modelf = gensim.models.FastText(size=10,
                                workers=5,
                                window=10,
Code Example #15
#%%
# Train GloVe word vectors
from glove import Glove
from glove import Corpus

# Read the training data and convert it into a Corpus first
sentense = []
with open("poem_for_embedding.txt") as f:
    for line in f.readlines():
        sentense.append(line.replace("\n", "").split(" "))
corpus_model = Corpus()
corpus_model.fit(sentense, window=5)  # window: size of the sliding context window

# 训练glove
embedding_dim = 10
glove = Glove(no_components=embedding_dim, learning_rate=0.05)  # no_components: dimensionality of the word embeddings
glove.fit(corpus_model.matrix, epochs=10, no_threads=4, verbose=True)  # verbose: print progress info during training
glove.add_dictionary(corpus_model.dictionary)

glove.save(f'glove_{embedding_dim}.txt')

# glove = Glove.load(f'glove_{embedding_dim}.txt')
# glove.most_similar('我', number=10)
Code Example #16
		self.wseq = wseq

	def __iter__(self):
		for i in range(0,self.mtx.shape[0]):
			b = np.asarray(self.mtx[i,:].todense())[0]
			idx = get_hot_idx(b)
			doc = get_word_seq(idx,self.wseq)
			yield list(doc)
			
for b in range(len(cuts)):
	corpus = iterate_corpus(d[:cuts[b],:],wseq)

	corpus_model = Corpus()

	corpus_model.fit(corpus,window=8)

	# corpus_model.save('08_Glove/corpus.model')

	print('Dict size: %s' % len(corpus_model.dictionary))
	print('Collocations: %s' % corpus_model.matrix.nnz)

	glove = Glove(no_components=fsize, learning_rate=0.05)

	glove.fit(corpus_model.matrix, epochs=ite, no_threads=6, verbose=True)

	glove.add_dictionary(corpus_model.dictionary)

	glove.save('08_Glove/model_d'+str(fsize)+'_size'+str(cuts[b])+'.model')


Code Example #17
File: glove_train.py  Project: wenshuoliu/DLproj
import pandas as pd
import numpy as np
import os
from glove import Glove
from ccs_tools import dx_multi, pr_multi

DX_cat = ['missing'] + sorted(dx_multi.ICD9CM_CODE)
PR_cat = ['missing'] + sorted(pr_multi.ICD9CM_CODE)
code_cat = ['missing'] + sorted(dx_multi.ICD9CM_CODE) + sorted(pr_multi.ICD9CM_CODE)

n_DX_cat = len(DX_cat)
n_PR_cat = len(PR_cat)
n_code_cat = len(code_cat)

path = '/nfs/turbo/umms-awaljee/wsliu/Data/NRD/'
model_path = path + 'models/'
if not os.path.exists(model_path): os.mkdir(model_path)
    
g = Glove(input_dim=n_code_cat, embedding_dim=100)
cooccur_df = pd.read_csv(path+'all/cooccur_df.csv')
g.train_glove(cooccur_df=cooccur_df, cache_path=model_path, epochs=100, verbose=2)

embed_mat = g.get_embed_mat()

np.save(path+'all/embed_mat0823.npy', embed_mat)
Code Example #18
File: main.py  Project: tracy-talent/research
    with open('../../output/vocabs_100.txt', 'r') as vbf:
        for line in vbf.readlines():
            vocab.append(line.strip())

    # Build the dictionary and accumulate the co-occurrence matrix
    dictionary = {}
    for i, word in enumerate(vocab):
        dictionary[word] = i
    corpus = []
    with open('../../input/wiki.500.txt', 'r') as cf:
        for line in cf.readlines():
            corpus.append([word for word in line.split()])
    corpus_obj = Corpus(dictionary=dictionary)
    corpus_obj.fit(corpus, window=10, ignore_missing=True)  # yields a sparse upper-triangular co-occurrence matrix
    corpus_obj.save('../../output/corpus_obj')
    # corpus_obj = Corpus.load('../../output/corpus_obj')  # restores self.dictionary and self.matrix

    glove = Glove(no_components=100,
                  learning_rate=0.05,
                  alpha=0.75,
                  max_count=1000,
                  max_loss=10.0,
                  random_state=None)
    glove.fit(corpus_obj.matrix, epochs=100, no_threads=6, verbose=True)
    glove.add_dictionary(dictionary=dictionary)
    wordvectors = glove.word_vectors.round(decimals=6)
    with open('../../output/glove100.wv', 'w') as wvf:
        for i, wv in enumerate(wordvectors):
            wvf.write(vocab[i] + ' ' + str(list(wv))[1:-1].replace(', ', ' ') +
                      '\n')
Code Example #19
def topk_recall_glove_embedding(click_all,
                                dict_label,
                                k=100,
                                dim=88,
                                epochs=30,
                                learning_rate=0.5):

    import psutil
    from glove import Glove
    from glove import Corpus

    data_ = click_all.groupby(
        ['pred',
         'user_id'])['item_id'].agg(lambda x: ','.join(list(x))).reset_index()
    list_data = list(data_['item_id'].map(lambda x: x.split(',')))

    corpus_model = Corpus()
    corpus_model.fit(list_data, window=999999)

    glove = Glove(no_components=dim, learning_rate=learning_rate)
    glove.fit(corpus_model.matrix,
              epochs=epochs,
              no_threads=psutil.cpu_count(),
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    list_user_id = []
    list_item_similar = []
    list_score_similar = []
    print('------- glove recall ---------')
    for i, row in tqdm(data_.iterrows()):

        list_item_id = row['item_id'].split(',')

        dict_item_id_score = {}
        for i, item in enumerate(list_item_id[::-1]):
            most_topk = glove.most_similar(item, number=k)
            for item_similar, score_similar in most_topk:
                if item_similar not in list_item_id:
                    if item_similar not in dict_item_id_score:
                        dict_item_id_score[item_similar] = 0
                    sigma = 0.8
                    dict_item_id_score[item_similar] += 1.0 / (
                        1 + sigma * i) * score_similar
        dict_item_id_score_topk = sorted(dict_item_id_score.items(),
                                         key=lambda kv: kv[1],
                                         reverse=True)[:k]
        assert len(dict_item_id_score_topk) == k
        dict_item_id_set = set([
            item_similar
            for item_similar, score_similar in dict_item_id_score_topk
        ])
        assert len(dict_item_id_set) == k
        for item_similar, score_similar in dict_item_id_score_topk:
            list_item_similar.append(item_similar)
            list_score_similar.append(score_similar)
            list_user_id.append(row['user_id'])

    topk_recall = pd.DataFrame({
        'user_id': list_user_id,
        'item_similar': list_item_similar,
        'score_similar': list_score_similar
    })
    topk_recall['next_item_id'] = topk_recall['user_id'].map(dict_label)
    topk_recall['pred'] = topk_recall['user_id'].map(
        lambda x: 'train' if x in dict_label else 'test')

    return topk_recall
Code Example #20
topics = [[] for i in range(len(df))]
para = [[] for i in range(len(df))]
topics1 = [[] for i in range(len(df))]
para1 = [[] for i in range(len(df))]
for i in range(len(df_combined)):
    text = df_combined.iloc[i][0]
    text = str(text)
    topics[i] = preprocess_text(text)
    text = df_combined.iloc[i][1]
    text = str(text)
    para[i] = preprocess_text(text)

corpus = Corpus()
corpus.fit(para, window=10)
glove = Glove(no_components=5, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
# glove.save('glove.model')

corpus1 = Corpus()
corpus1.fit(topics, window=10)
glove1 = Glove(no_components=5, learning_rate=0.05)
glove1.fit(corpus1.matrix, epochs=30, no_threads=4, verbose=True)
glove1.add_dictionary(corpus1.dictionary)

import numpy as np


def get_answers(df_combined, query1):
    query = Answer_Pre_Processing(query1)
Code Example #21
File: get_cooccur_all.py  Project: wenshuoliu/DLproj
code_cat = ['missing'] + sorted(dx_multi.ICD9CM_CODE) + sorted(
    dx_multi.ICD9CM_CODE) + sorted(pr_multi.ICD9CM_CODE)

DX1_dict = dict(zip(DX1_cat, range(len(DX_cat))))
DX_dict = dict(zip(DX_cat, [0] + list(range(len(DX_cat), len(DX_cat) * 2))))
PR_dict = dict(
    zip(PR_cat, [0] +
        list(range(len(DX_cat) * 2 - 1,
                   len(DX_cat) * 2 + len(PR_cat) - 1))))

DXs = ['DX' + str(j) for j in range(2, 31)]
PRs = ['PR' + str(j) for j in range(1, 16)]

unclassified = set(dx_multi.loc[dx_multi.CCS_LVL1 == '18', 'ICD9CM_CODE'])

g = Glove(input_dim=len(code_cat), embedding_dim=100)

#dtypes = dict(zip(DXs, [bytes]*30))
#dtypes.update(zip(PRs, [bytes]*15))

dxpr_df = pd.read_csv(path + 'raw/2014/NRD_2014_Core.CSV',
                      sep=',',
                      header=None,
                      names=core_cols,
                      dtype=core_dtypes_pd,
                      na_values=na_values,
                      chunksize=500000)

chunk_id = 0
for df in dxpr_df:
    start = time.time()
Code Example #22
corpus4 = Corpus()


corpus4.fit(inputPosts2, window=10)
glove4 = Glove(no_components=100, learning_rate=0.05)

glove4.fit(corpus4.matrix, epochs=1000, no_threads=10, verbose=True)
glove4.add_dictionary(corpus4.dictionary)
glove4.save('GPStemmedOneList.model')

corpus = Corpus() 



corpus.fit(inputPosts, window=10)
glove = Glove(no_components=100, learning_rate=0.05)

glove.fit(corpus.matrix, epochs=1000, no_threads=10, verbose=True)
glove.add_dictionary(corpus.dictionary)
glove.save('GP.model')'''

corpus2 = Corpus()

corpus2.fit(inputPosts2, window=10)
glove2 = Glove(no_components=100, learning_rate=0.05)

glove2.fit(corpus2.matrix, epochs=1000, no_threads=10, verbose=True)
glove2.add_dictionary(corpus2.dictionary)
glove2.save('MedHelpStemmed.model')
Code Example #23
cooccur_all = pd.read_csv(path+'multi_space_glove/cooccur_df_all_10.csv')
all_df = pd.read_csv(path+'cohorts20/{}/pred_comorb.csv'.format(cohort), dtype=core_dtypes_pd)

preprocessed = preprocess(all_df, DX1_cat=DX1_cat, DX_cat=DX_cat, PR_cat=PR_cat)
DX1_dict = preprocessed['DX1_dict']
DX_dict = preprocessed['DX_dict']
PR_dict = preprocessed['PR_dict']
code_cat = preprocessed['code_cat']
hosp_cat = preprocessed['hosp_cat']
dx1_ccs_dict = preprocessed['dx1_ccs_dict']
dx_ccs_dict = preprocessed['dx_ccs_dict']
pr_ccs_dict = preprocessed['pr_ccs_dict']
parent_pairs = preprocessed['parent_pairs']
hosp_cat = preprocessed['hosp_cat']

g = Glove(input_dim=len(code_cat), embedding_dim=code_embed_dim, count_cap=count_cap)
g.train_glove(cooccur_df=cooccur_all, cache_path=model_path+'temp/{}/'.format(job_index),
              batch_size=1024*8, epochs=80, earlystop_patience=10, reducelr_patience=2,
              parent_pairs=parent_pairs, lamb=penalty, metric=penalty_metric, verbose=2)
embed_mat = g.get_embed_mat()

all_df = preprocessed['int_df']
tst_key = pd.read_csv(path+'cohorts20/{}/tst_key{}.csv'.format(cohort, tst_seed), names = ['KEY_NRD'])
tst_df = all_df.loc[all_df.KEY_NRD.isin(tst_key.KEY_NRD)]
train_df0 = all_df.loc[~all_df.KEY_NRD.isin(tst_key.KEY_NRD)].reset_index()

## convert different variables into different np.array
n_DX = 29
n_PR = 15
DXs = ['DX'+str(j) for j in range(2, n_DX+2)]
PRs = ['PR'+str(j) for j in range(1, n_PR+1)]

age_mean = train_df0['AGE'].mean()
Code Example #24
def train_and_eval_crf(thread_ids, posts, labels, max_posts=20,
                       max_words=400, frac=[0.8, 0.1, 0.1], seed=0,
                       batch_size=9, embedding='glove', max_epoch=500,
                       validate=False, result_dir=None):

    # preliminary check
    if len(thread_ids) != len(posts) or \
        len(thread_ids) != len(labels) or \
        len(posts) != len(labels):
        raise Exception('Invalid length of data.')

    if len(frac) != 3 or frac[0]+frac[1]+frac[2] != 1:
        raise Exception('Invalid value of frac.')
    
    if frac[0] <= 0 or frac[1] <= 0 or frac[2] <= 0:
        raise Exception('Invalid value(s) for one or more frac element(s).')

    if embedding not in ['glove']:
        raise Exception('Invalid embedding.')
    
    train_texts, train_labels, test_texts, test_labels, val_texts, val_labels = utils.filter_and_shuffle_data(thread_ids, posts, labels, max_words, max_posts, seed, frac)
    
    # from here on is glove specific implementation (may need to extract to a function)
    print('Init embedding')
    glove = Glove()
    glove.create_custom_embedding([item for sublist in train_texts for item in sublist])
    glove.add_to_embedding(['.', '!', '?'])

    print('Padding and packing data into data loader')
    for i, thread in enumerate(train_texts):
        for j, post_text in enumerate(thread):
            train_texts[i][j] = glove.sentence_to_indices(post_text, seq_len=max_words)
    for i, thread in enumerate(test_texts):
        for j, post_text in enumerate(thread):
            test_texts[i][j] = glove.sentence_to_indices(post_text, seq_len=max_words)
    for i, thread in enumerate(val_texts):
        for j, post_text in enumerate(thread):
            val_texts[i][j] = glove.sentence_to_indices(post_text, seq_len=max_words)

    # padding at the post level
    post_padding = [glove.word2idx[glove.pad_token]] * max_words
    for posts in [train_texts, test_texts, val_texts]:
        for sublist in posts:
            if len(sublist) < max_posts:
                sublist.extend([post_padding] * (max_posts - len(sublist)))
    
    train_masks, test_masks, val_masks = [], [], []
    def get_to_append(ones):
        # build a mask of `ones` 1s, padded with 0s up to max_posts
        to_append = [1] * ones
        if len(to_append) < max_posts:
            to_append.extend([0] * (max_posts - ones))
        return to_append

    for labels in train_labels:
        to_append = get_to_append(len(labels))
        train_masks.append(to_append)

    for labels in test_labels:
        to_append = get_to_append(len(labels))
        test_masks.append(to_append)

    for labels in val_labels:
        to_append = get_to_append(len(labels))
        val_masks.append(to_append)

    for labels in [train_labels, test_labels, val_labels]:
        for sublist in labels:
            if len(sublist) < max_posts:
                sublist.extend([0] * (max_posts-len(sublist)))

    train_loader = utils.to_data_loader(batch_size, train_texts, train_labels, train_masks)
    test_loader = utils.to_data_loader(batch_size, test_texts, test_labels, test_masks)
    val_loader = utils.to_data_loader(batch_size, val_texts, val_labels, val_masks)

    print('Creating model')
    embedding = create_emb_layer(torch.from_numpy(glove.weights_matrix).float().to(utils.get_device()))
    model = hLSTM_CRF(num_tags=2,
                      input_size=glove.emb_dim,
                      hidden_size=glove.emb_dim, 
                      output_size=glove.emb_dim,
                      batch_size=batch_size,
                      num_layers=1, 
                      bidirectional=False,
                      embedding=embedding, 
                      drop_prob=0.5,
                      max_output=max_posts,
                      device=utils.get_device())

    labels = [label for sublist in train_labels for label in sublist]
    
    intervention_ratio = len([label for label in labels if label == 1]) / len(labels)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    writer = None
    if result_dir is not None:
        writer = SummaryWriter(f'runs/{result_dir}')

        if not os.path.exists(f'models/{result_dir}'):
            os.makedirs(f'models/{result_dir}')

    if not validate:
        val_loader = None

    print('Start training model')
    model.zero_grad()
    model.train()

    running_loss = 0.0
    for epoch in range(max_epoch):
        if (epoch + 1) % 20 == 0:
            print(f'Training model ({epoch + 1} / {max_epoch})')

        for i, (inputs, labels, masks) in enumerate(train_loader):
            inputs, labels, masks = inputs.to(utils.get_device()), labels.to(utils.get_device()), masks.to(utils.get_device())

            loss = model.loss(inputs, labels, masks)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
            optimizer.step()

            running_loss += loss.item()
            if i % 1000 == 999: # every 1000 mini-batches
                if writer is not None:
                    writer.add_scalar('training loss', 
                                      running_loss / 1000,
                                      epoch * len(train_loader) + i)
                    running_loss = 0.0
                
                if val_loader is not None:
                    f1, _, _ = eval_model(model, val_loader)
                    writer.add_scalar('validation f1', f1,
                                      epoch * len(train_loader) + i)

    print('Evaluating model')
    f1, precision, recall = eval_model(model, test_loader, False)

    print(f'''
    Test results:
    F1 = {f1}
    Precision = {precision}
    Recall = {recall}
    ''')

    if result_dir is not None:
        print('Saving final model')
        torch.save(model.state_dict(), f'models/{result_dir}/final_model.pth')

    print('DONE :)))')
Code Example #25
# trg_word2vec = Word2Vec.load('trg_embedd.model')

# for i in range(src_vocabsize):
#     word = list(SRC.vocab.stoi.keys())[i]
#     if word in src_word2vec.wv.index2word:
#         src_embed_mtrx[SRC.vocab.stoi[word]] = torch.tensor(src_word2vec.wv[word].copy()).to(device)
#
# for i in range(trg_vocabsize):
#     word = list(TRG.vocab.stoi.keys())[i]
#     if word in trg_word2vec.wv.index2word:
#         trg_embed_mtrx[TRG.vocab.stoi[word]] = torch.tensor(trg_word2vec.wv[word].copy()).to(device)

'''
for glove
'''
glove = Glove()
src_glove = glove.load('src_glove.model')
trg_glove = glove.load('trg_glove.model')

for word in list(SRC.vocab.stoi.keys()):
    if word in src_glove.dictionary:
        src_embed_mtrx[SRC.vocab.stoi[word]] = torch.tensor(src_glove.word_vectors[src_glove.dictionary[word]].copy()).to(device)

for word in list(TRG.vocab.stoi.keys()):
    if word in trg_glove.dictionary:
        trg_embed_mtrx[TRG.vocab.stoi[word]] = torch.tensor(trg_glove.word_vectors[trg_glove.dictionary[word]].copy()).to(device)

print("pretrained word embeddings loaded")
sys.stdout.flush()

'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
Code Example #26
# Calculate distribution, to account for 95th percentile of messages.
max_sentence_length = int(np.mean([len(x) for x in texts]) + (norm.ppf(0.95) * np.std([len(x) for x in texts])))

print("Max sentence length: {}, put that in settings.json.".format(max_sentence_length))

corpus = Corpus()
try:
    print("Loading pretrained corpus...")
    corpus = Corpus.load("cache/corpus.p")
except:
    print("Training corpus...")
    corpus.fit(texts, window=max_sentence_length)
    corpus.save("cache/corpus.p")

glove = Glove(no_components=number_components, learning_rate=0.05)
try:
    print("Loading pretrained GloVe vectors...")
    glove = Glove.load("cache/glove.p")
except:
    print("Training GloVe vectors...")
    # More epochs seems to make it worse
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save("cache/glove.p")

# Convert input text
print("Vectorizing input sentences...")
X = vectify(texts, previous_message, glove.dictionary, max_sentence_length, contextual)
y = np.array([x == u'1' for x in classes]).astype(np.int32)
Code Example #27
File: main.py  Project: LieAnn/CCC_EAoP
                ETRI_mod_dict=ETRI_dependency_mod_dict,
                weight=weight)
            print('add dependency complete')
            ''' reduce co-occurrence matrix'''
            remove_pos_list = [
                'JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ', 'JX', 'JC',
                'EP', 'EF', 'EC', 'ETN', 'ETM', 'XPN', 'XSN', 'XSV', 'XSA',
                'XR', 'SF', 'SE', 'SSO', 'SSC', 'SC', 'SY', 'SL', 'SH', 'SN',
                'UNKNOWN', 'UNDETERMINED'
            ]
            co_occurrence_mat, vocab2idx, idx2vocab = reduce_matrix(
                co_occurrence_mat, remove_pos_list, vocab2idx, idx2vocab)
            print("Remove unnecessary POS complete")
            ''' Train the glove '''
            co_occurrence_csrmat = sparse.csr_matrix(co_occurrence_mat)
            glove = Glove(no_components=n_dim)
            ret_dict = glove.fit(co_occurrence_csrmat.tocoo(),
                                 epochs=n_epoch,
                                 verbose=True)
            glove.add_dictionary(vocab2idx)
            print("Training glove complete")

            for i in range(n_epoch):
                if i % 100 == 0 or i == n_epoch - 1:
                    one_word_vectors = ret_dict[i]
                    '''  explore the glove '''
                    emotion_centroid_dict = find_emotion_centroid(
                        one_word_vectors, emotion2word_dict, vocab2idx)
                    ''' convert word vector to 6 dim vector'''
                    matrix_based_emotion, emotion_order = transform_based_emotion(
                        one_word_vectors,
Code Example #28
import tensorflow as tf
import pandas as pd
from glove import Corpus, Glove

MAX_WORDS = 200000
MAX_LEN = 200000

patents = pd.read_csv("txtheaders.csv")
patent_text = patents["txt"].str.lower()
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_WORDS, lower=True, oov_token="OOV")
tokenizer.fit_on_texts(patent_text)
patent_sequences = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(patent_text), maxlen=MAX_LEN)

patent_words = []
for t in range(patent_sequences.shape[0]):
    pt1 = tokenizer.sequences_to_texts(patent_sequences[t].reshape(1, MAX_LEN))
    pt2 = [x for x in pt1[0].split(" ") if
           not any(char.isdigit() for char in x)
           and len(x)<16]
    patent_words.extend(pt2)

corpus = Corpus()
corpus.fit(patent_words, window=20)
embeddings = Glove(no_components=200, learning_rate=0.05)
embeddings.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
embeddings.add_dictionary(corpus.dictionary)
embeddings.save('patent.glove')
# embeddings = glove.load('patent.glove')

print(embeddings.word_vectors[embeddings.dictionary['computer']])
Code Example #29
    newfather[s] = f
id2code = {}
for k in types:
    id2code[types[k]] = k
corMat = np.zeros((len(types), len(types)))
for i in range(len(types)):
    corMat[i, i] = 1
    curId = i
    while curId in newfather:
        f = newfather[curId]
        corMat[i, f] = 1
        curId = f
trainFile = '../data/mimic/mimic.train'
coOccurMat = np.zeros((len(types), len(types)))
trainSet = pickle.load(open(trainFile, 'rb'))[0]
for patient in trainSet:
    for visit in patient:
        augmented = np.nonzero(sum(corMat[visit]))[0]
        listNum = len(augmented)
        for i in range(listNum):
            for j in range(i + 1, listNum):
                coOccurMat[augmented[i]][augmented[j]] += 1
                coOccurMat[augmented[j]][augmented[i]] += 1
coOccurMatzip = coo_matrix(coOccurMat.astype(np.float64))  # np.float alias was removed in recent NumPy
glove = Glove(no_components=128, learning_rate=0.05)
glove.fit(coOccurMatzip, epochs=50, no_threads=1, verbose=True)  # no_threads expects an int
res = glove.word_vectors.astype(np.float32)
pickle.dump(
    (types, newfather, corMat, res), open(resFile, 'wb'), -1
)  # save types (node to id, newfather, cormat and pre-trained embeddings)
Code Example #30
def generateModel(traces):
    linesentence = ""
    myFile = open("sentences.txt", 'w')
    mypackCount = 0
    for trace in traces:
        for packet in trace.getPackets():
            mypackCount = mypackCount + 1
            linesentence = linesentence + " " + str(
                packet.getLength()) + "_" + str(packet.getDirection())
            # reset sentences
            if mypackCount % 40 == 0:
                myFile.write(linesentence)
                myFile.write("\n")
                mypackCount = 0
                linesentence = ""

        directionCursor = None
        dataCursor = 0
        for packet in trace.getPackets():
            if directionCursor == None:
                directionCursor = packet.getDirection()

            if packet.getDirection() != directionCursor:
                #dataKey = 'S'+str(directionCursor)+'-'+str( GloveClassifier.roundArbitrary(dataCursor, 600) )
                dataKey = 'S' + str(directionCursor) + '-' + str(dataCursor)
                linesentence = linesentence + " " + dataKey
                directionCursor = packet.getDirection()
                dataCursor = 0

            dataCursor += packet.getLength()

        if dataCursor > 0:
            key = 'S' + str(directionCursor) + '-' + str(dataCursor)
            linesentence = linesentence + " " + key
        myFile.write(linesentence)
        myFile.write("\n")

    myFile.close()
    if config.CLASSIFIER == config.GLOVE_CLASSIFIER:
        sentences = models.word2vec.LineSentence("sentences.txt")
        corpus = Corpus()

        corpus.fit(sentences, window=8)
        glove = Glove(no_components=25, learning_rate=0.05)
        glove.fit(corpus.matrix, epochs=100, no_threads=4, verbose=True)
        glove.add_dictionary(corpus.dictionary)
        glove.save("mygloveModel")

    elif config.CLASSIFIER == config.W2V_CLASSIFIER:
        txt = open("sentences.txt")
        # print txt.read()
        if (len(txt.read()) > 0):
            print "in here"
            txt.close()
            sentences = models.word2vec.LineSentence("sentences.txt")
            model = models.word2vec.Word2Vec(sentences,
                                             size=50,
                                             window=15,
                                             min_count=1,
                                             workers=4)
            model.save("word2vecModel")
        txt.close()