Example #1
0
def main():
    """Load the tweet training set, build SSWE embeddings from the saved
    vocabulary/vectors files, and convert the tweets to feature vectors.

    Returns:
        0 on success (script exit-code convention).
    """
    # Paths to the training data and the pre-trained SSWE artifacts.
    train_set = '/root/Text_Mining_Project/src/data/train.tsv'  # training tweets (TSV)
    model = 'sswe_model'  # file a trained model would be saved to (currently unused here)
    sswe_vocabs = '/root/Text_Mining_Project/src/words.txt'  # vocabulary file
    sswe_vectors = '/root/Text_Mining_Project/src/vectors.txt'  # embedding vectors file

    # Read the tweets: text in column 2, label in column 1, collecting bigrams.
    reader = TweetReader(text_field=2, label_field=1, ngrams=2)
    reader.read(train_set)

    # Vocabulary the saved embeddings were trained with.
    base_vocab = reader.load_vocabulary(sswe_vocabs)

    # Load the pre-trained embeddings for that vocabulary.
    embeddings = Embeddings(vocab_file=sswe_vocabs,
                            vectors=sswe_vectors,
                            vocab=base_vocab,
                            variant=None)

    # The converter maps token sequences to embedding feature vectors.
    converter = Converter()
    converter.add(embeddings)
    # print(x) is equivalent to the old print statement and Python-3 safe.
    print(reader.sentences)
    # Lazily convert the tweets; cache=True avoids re-converting when the
    # generator is iterated more than once.
    converted_tweets = converter.generator(reader.sentences, cache=True)
    return 0
Example #2
0
def sswe_trainer(model_parameters):
    """Train an SSWE (sentiment-specific word embedding) model.

    model_parameters: argparse-style namespace providing the fields used
        below (train, vocab, vectors, variant, vocab_size, minOccurr,
        textField, tagField, ngrams, embeddings_size, iterations, model,
        verbose, config_file).

    Side effects: saves the vocabulary/vectors files and, if args.model is
    set, the trained model.
    """
    # Fixed seed for replicability of training runs.
    np.random.seed(42)
    args = model_parameters

    log_format = '%(message)s'
    # FIX: the original computed this and then unconditionally overwrote it
    # with INFO, which made the --verbose flag a no-op.
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)
    # merge args with config

    reader = TweetReader(text_field=args.textField,
                         label_field=args.tagField,
                         ngrams=args.ngrams)
    reader.read(args.train)
    vocab, bigrams, trigrams = reader.create_vocabulary(
        reader.sentences, args.vocab_size, min_occurrences=args.minOccurr)

    if args.variant == 'word2vec' and os.path.exists(args.vectors):
        # Pre-trained word2vec vectors: extend them with the corpus vocab.
        embeddings = Embeddings(vectors=args.vectors, variant=args.variant)
        embeddings.merge(vocab)
        logger.info("Saving vocabulary in %s" % args.vocab)
        embeddings.save_vocabulary(args.vocab)
    elif os.path.exists(args.vocab):
        # Start with the given vocabulary, dropping the n-gram tail so only
        # the leading (unigram) part serves as the base vocabulary.
        b_vocab = reader.load_vocabulary(args.vocab)
        bound = len(b_vocab) - len(bigrams) - len(trigrams)
        base_vocab = b_vocab[:bound]
        if os.path.exists(args.vectors):
            # Load existing embedding vectors for the base vocabulary.
            embeddings = Embeddings(vectors=args.vectors,
                                    vocab=base_vocab,
                                    variant=args.variant)
        else:
            # Create fresh embeddings, then add the n-grams from the corpus.
            embeddings = Embeddings(args.embeddings_size,
                                    vocab=base_vocab,
                                    variant=args.variant)
            embeddings.merge(vocab)
            logger.info("Overriding vocabulary in %s" % args.vocab)
            embeddings.save_vocabulary(args.vocab)
    else:
        # No saved vocabulary: build embeddings from the corpus vocabulary.
        embeddings = Embeddings(args.embeddings_size,
                                vocab=vocab,
                                variant=args.variant)
        logger.info("Saving vocabulary in %s" % args.vocab)
        embeddings.save_vocabulary(args.vocab)

    # Assume bigrams are a prefix of trigrams, or else we should put a
    # terminator on the trie.
    trie = {}
    for ngram in chain(bigrams, trigrams):
        node = trie
        for w in ngram:
            node = node.setdefault(embeddings.dict[w], {})

    converter = Converter()
    converter.add(embeddings)

    trainer = create_trainer(args, converter)

    # Report roughly 200 times over the whole run.
    # FIX: the original overwrote this with a hard-coded 10000 marked
    # "# DEBUG" (leftover debug code), and used "/" which yields a float
    # under Python 3; "//" keeps the interval an int.
    report_intervals = max(args.iterations // 200, 1)

    logger.info("Starting training")

    # A cached generator: converted sentences can be iterated several times
    # without repeating the conversion work.
    converted_sentences = converter.generator(reader.sentences, cache=True)
    trainer.train(converted_sentences, reader.polarities, trie,
                  args.iterations, report_intervals)

    logger.info("Overriding vectors to %s" % args.vectors)
    embeddings.save_vectors(args.vectors, args.variant)
    if args.model:
        logger.info("Saving trained model to %s" % args.model)
        trainer.save(args.model)
Example #3
0
                        action='store_true')

    args = parser.parse_args()

    log_format = '%(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)

    # merge args with config

    reader = TweetReader(text_field=args.textField, label_field=args.tagField, ngrams=args.ngrams)
    reader.read(args.train)
    vocab, bigrams, trigrams = reader.create_vocabulary(reader.sentences,
                                                        args.vocab_size,
                                                        min_occurrences=args.minOccurr)
    if args.variant == 'word2vec' and os.path.exists(args.vectors):
        embeddings = Embeddings(vectors=args.vectors, variant=args.variant)
        embeddings.merge(vocab)
        logger.info("Saving vocabulary in %s" % args.vocab)
        embeddings.save_vocabulary(args.vocab)
    elif os.path.exists(args.vocab):
        # start with the given vocabulary
        base_vocab = reader.load_vocabulary(args.vocab)
        if os.path.exists(args.vectors):
            # load embeddings
            embeddings = Embeddings(vectors=args.vectors, vocab=base_vocab,
Example #4
0
                        action='store_true')

    args = parser.parse_args()

    log_format = '%(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)

    # merge args with config

    reader = TweetReader(args.ngrams)
    reader.read(args.train)
    loaded_vocab = False
    if args.vocab and os.path.exists(args.vocab):
        loaded_vocab = True
        vocab = reader.load_vocabulary(args.vocab)
    else:
        vocab = reader.create_vocabulary(reader.sentences)
    tokens = []
    for l in vocab: tokens.extend(l) # flatten ngrams dictionaries
    embeddings = Embeddings(args.embeddings_size, vocab=tokens,
                            variant=args.variant)

    converter = Converter()
    converter.add_extractor(embeddings)
Example #5
0
                        action='store_true')

    args = parser.parse_args()

    log_format = '%(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)

    # merge args with config

    reader = TweetReader(args.ngrams)
    reader.read(args.train)
    vocab, bigrams, trigrams = reader.create_vocabulary(reader.sentences,
                                                        min_occurrences=2)
    if os.path.exists(args.vocab):
        # start with the given vocabulary
        base_vocab = reader.load_vocabulary(args.vocab)
        if os.path.exists(args.vectors):
            embeddings = Embeddings(vectors=args.vectors,
                                    vocab=base_vocab,
                                    variant=args.variant)
        else:
            embeddings = Embeddings(args.embeddings_size,
                                    vocab=base_vocab,
                                    variant=args.variant)
        # add the ngrams from the corpus