Example #1
def create_corpora(corpus_fn):
    unigram_corpus = read_corpus(open(corpus_fn), skip=["#"])
    normalize_corpus(unigram_corpus)

    morpheme_corpus = read_corpus(open(corpus_fn), "#")
    normalize_corpus(morpheme_corpus)
    return unigram_corpus, morpheme_corpus
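read_corpus itself is not shown in any of these snippets. Purely as a rough mental model for the automaton/WFSA examples, the call sites (file handle, optional separator, optional skip prefixes, followed by normalize_corpus) are consistent with a sketch like the one below; the _sketch names and their semantics are assumptions, not the projects' actual implementations.

from collections import defaultdict

def read_corpus_sketch(stream, separator=None, skip=()):
    # Hypothetical stand-in inferred from the call sites: one word per line,
    # split into morphemes when a separator such as "#" is given, otherwise
    # into characters; lines starting with any skip prefix are ignored.
    counts = defaultdict(int)
    for line in stream:
        line = line.strip()
        if not line or any(line.startswith(prefix) for prefix in skip):
            continue
        units = tuple(line.split(separator)) if separator else tuple(line)
        counts[units] += 1
    return dict(counts)

def normalize_corpus_sketch(corpus):
    # Hypothetical stand-in for normalize_corpus: turn counts into
    # probabilities so the values sum to 1 (later examples treat
    # corpus.values() as probabilities).
    total = float(sum(corpus.values()))
    return {units: count / total for units, count in corpus.items()}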
Example #2
def create_wfsa(options):
    # open output file or write to stdout
    output = open(options.output, "w") if options.output else sys.stdout

    # read initial transitions if given
    it = options.initial_transitions
    initial_transitions = Automaton.read_transitions(it) if it else {}

    # create uniform automaton with given number of states per letter
    # and the possibility of predefining some transitions
    if options.emitfile:
        numbers_per_letters = read_dict(open(options.emitfile))
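        # presumably a mapping like {"a": 3, "b": 2, "EPSILON": 1} giving the
        # number of states per letter (an assumption; it mirrors how
        # numbers_per_letters is built from options.numstate below)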
        automaton = Automaton.create_uniform_automaton(numbers_per_letters, initial_transitions=initial_transitions)
        if options.smooth:
            automaton.smooth()
        automaton.dump(output)
        return

    if options.numstate:
        input_ = sys.stdin
        corpus = read_corpus(input_, options.separator)
        alphabet = get_alphabet(corpus)
        numbers_per_letters = {letter: options.numstate for letter in alphabet}
        if options.num_epsilons:
            numbers_per_letters["EPSILON"] = options.num_epsilons

        automaton = Automaton.create_uniform_automaton(numbers_per_letters, initial_transitions)
        if options.smooth:
            automaton.smooth()
        automaton.dump(output)
        return

    if options.init_from_corpus:
        if len(initial_transitions) > 0:
            raise Exception(
                "Using initial transitions (-I option) when creating an automaton from a corpus is not implemented"
            )
        input_ = open(options.init_from_corpus)
        corpus = read_corpus(input_, options.separator)
        corpus = normalize_corpus(corpus)
        automaton = Automaton.create_from_corpus(corpus)
        if options.smooth:
            automaton.smooth()
        automaton.dump(output)
        return

    # fallback
    logging.error("Options are not complete, something is missing to create " + "an Automaton")
    sys.exit(-1)
Example #3
def main():
    quantizer = AbstractQuantizer.read(open(sys.argv[1]))
    corp = read_corpus(open(sys.argv[2]), separator="#")
    normalize_corpus(corp)
    probs = corp.values()
    dist = compute_entropy(probs, quantizer)
    print(dist)
Example #4
def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    parser = argparse.ArgumentParser(description='Train LDA model')
    parser.add_argument('--train', help='training corpus', required=True)
    parser.add_argument('--topics', help='number of topics', type=int, required=True)
    parser.add_argument('--iter', help='number of iterations', type=int, required=True)
    parser.add_argument('--pyp', help='use pyp priors', action='store_true')
    args = parser.parse_args()

    vocabulary = Vocabulary()

    logging.info('Reading training corpus')
    with open(args.train) as train:
        training_corpus = read_corpus(train, vocabulary)

    if args.pyp:
        logging.info('Using a PYP prior')
        doc_process = lambda: PYP(theta_doc, d_doc, Uniform(args.topics))
        topic_process = lambda: PYP(theta_topic, d_topic, Uniform(len(vocabulary)))
    else:
        logging.info('Using a Dirichlet prior')
        doc_process = lambda: DirichletMultinomial(args.topics, theta_doc)
        topic_process = lambda: DirichletMultinomial(len(vocabulary), theta_topic)

    model = TopicModel(args.topics, len(training_corpus), doc_process, topic_process) 

    logging.info('Training model with %d topics', args.topics)
    run_sampler(model, training_corpus, args.iter)
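For reference, the argument parser above implies an invocation of roughly this shape (the script and corpus file names are hypothetical):

# python train_lda.py --train corpus.txt --topics 20 --iter 1000 --pyp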
Example #5
def similarity(sent, topN=10):

    corpus_lines = read_corpus(ner_result_path)
    texts = [line.split("\t")[0].split(' ') for line in corpus_lines]

    keywords = one_ner_tag(sent)

    dictionary = corpora.Dictionary(texts)
    num_features = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus)

    new_vec = dictionary.doc2bow(keywords)
    # similarity computation
    index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features)
    # index = similarities.Similarity('-Similarity-index', corpus, num_features)
    # print('\nSparse vectors of the TF-IDF model:')
    # for i in tfidf[corpus]:
    #     print(i)
    # print('\nTF-IDF sparse vector of the keywords:')
    # print(tfidf[new_vec])

    sims = index[tfidf[new_vec]]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])

    print("\n相似度计算")
    print('Words: {}\nText: {}\n'.format(keywords, sent))

    for k, v in sims[:topN]:
        i = int(k)
        print('Similarity: {}\nWords: {}\nText: {}'.format(
            v, corpus_lines[i].split("\t")[0].split(' '),
            corpus_lines[i].split("\t")[1]))
Example #6
def main():
    corpus = read_corpus(open(sys.argv[1]), separator="#")
    normalize_corpus(corpus)
    wfsa = create_word_wfsa(corpus)
    wfsa.finalize()
    if len(sys.argv) == 4:
        wfsa.quantizer = LogLinQuantizer(int(sys.argv[2]), int(sys.argv[3]))
        wfsa.round_and_normalize()
    wfsa.dump(sys.stdout)
Example #7
def main(args):
    model = load_model(args)
    print "loaded " + args.model

    raw_corpus = corpus.read_corpus(args.corpus)
    list_words, vocab_map, embeddings, padding_id = corpus.load_embeddings(corpus.load_embedding_iterator(args.embeddings))
    print("loaded embeddings")
    ids_corpus = corpus.map_corpus(vocab_map, raw_corpus)

    evaluation(args, padding_id, ids_corpus, vocab_map, embeddings, model)
Example #8
def main():
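    # expected argv (inferred from the indices used below):
    #   1: automaton dump file, 2: corpus file, 3: separator, 4: prefix of lines to skip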
    # read automaton
    wfsa = Automaton.create_from_dump(open(sys.argv[1]))
    # read corpus
    corpus = read_corpus(open(sys.argv[2]), separator=sys.argv[3], skip=[sys.argv[4]])
    normalize_corpus(corpus)
    # call distance_from_corpus
    distances = {}
    dist = wfsa.distance_from_corpus(corpus, Automaton.kullback, distances=distances)
    # print out result
    for k, v in distances.items():
        print(k, v)
Example #9
def main():
    automaton = Automaton.create_from_dump(open(sys.argv[1]))
    corpus = read_corpus(open(sys.argv[2]))
    normalize_corpus(corpus)
    entropy = float(sys.argv[3])
    string_bits = "u"
    if len(sys.argv) > 4:
        string_bits = sys.argv[4]
    q = LogLinQuantizer(10, -20)
    automaton.quantizer = q

    encoder = Encoder(entropy, string_bits)
    print(encoder.encode(automaton, corpus))
Example #10
def main():
    corpus = read_corpus(sys.stdin, separator="#")
    n_corpus = normalize_corpus(corpus)
    file_name = sys.argv[1]
    fsa_type = sys.argv[2]
    if fsa_type == 'plain':
        fsa_creator = lambda corpus: create_three_state_fsa(corpus)
    elif fsa_type == 'hogy':
        fsa_creator = lambda corpus: create_hogy_fsa(corpus)
    elif fsa_type == 'o':
        fsa_creator = lambda corpus: create_o_fsa(corpus)
    elif fsa_type == 'new':
        fsa_creator = lambda corpus: create_new_three_state_fsa(corpus, ["hogy", ("vala", "ki")], "m")
    else:
        logging.critical('unknown fsa type: {0}'.format(fsa_type))
        sys.exit(-1)
    
    create_wfsa(fsa_creator, file_name, n_corpus)
Example #11
def main(options):
    if not options.automaton_file:
        raise Exception("Automaton \"option\" (-a) is mandatory")
    automaton = Automaton.create_from_dump(open(options.automaton_file))

    if options.quantizer:
        automaton.quantizer = AbstractQuantizer.read(open(options.quantizer))
        automaton.round_and_normalize()

    input_ = sys.stdin
    if options.corpus:
        input_ = open(options.corpus)
    corpus = read_corpus(input_, options.separator)
    corpus = normalize_corpus(corpus)

    learner = Learner.create_from_options(automaton, corpus, options)
    learner.main()

    output = sys.stdout
    if options.output:
        output = open(options.output, "w")
    learner.automaton.dump(output)

Example #13
def sequence_encoding(sequence, str_to_idx):
    """
    Transform list of strings into a tensor of integers to be processed by the pytorch model
    :param sequence: list of strings
    :param str_to_idx: dictionary that maps a string to a unique integer
    :return: pytorch tensor (vector) of long values
    """
    sequence_of_indexes = [str_to_idx[element] for element in sequence]
    return torch.tensor(sequence_of_indexes, dtype=torch.long)
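# A quick usage sketch for sequence_encoding (the mapping below is made up for
# illustration):
#   sequence_encoding(["le", "chat"], {"le": 0, "chat": 1})  ->  tensor([0, 1])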


# READ CORPUS, PREPARE DATA:
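# the three numeric arguments appear to be train/test/dev split fractions
# (0.8 / 0.2 / 0 below, and 1 / 0 / 0 to load the whole corpus); this is an
# inference from the call sites, not something documented here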

liste_X_train, liste_Y_train, liste_X_test, liste_Y_test, liste_X_dev, liste_Y_dev = read_corpus("sequoia-7.0/sequoia.deep.conll", 0.8, 0.2, 0)
liste_X_whole_corpus, liste_Y_whole_corpus, _, _, _, _ = read_corpus("sequoia-7.0/sequoia.deep.conll", 1, 0, 0)
print("len(liste_X_train): ", len(liste_X_train))
print("len(liste_X_test): ", len(liste_X_test), " len(liste_X_dev): ", len(liste_X_dev))
print("first element (x,y) train:  x = ", liste_X_train[1], ", y = ", liste_Y_train[1])

"""
import json
with open("pos_data_sequoia.txt", 'w', encoding="utf-8") as pos_data_sequoia_file:
    sequoia_pos_json = json.dumps({"train_data":{"X":liste_X_train,"Y":liste_Y_train},"test_data":{"X":liste_X_test,"Y":liste_Y_test},"dev_data":{"X":liste_X_dev,"Y":liste_Y_dev}})
    pos_data_sequoia_file.write(sequoia_pos_json)
"""

# Create dictionaries
tag_to_idx = create_dict_str_to_idx(liste_Y_whole_corpus)
word_to_idx = create_dict_str_to_idx(liste_X_whole_corpus)
Example #14
def gensim_tags():
    ckpt_file = tf.train.latest_checkpoint(model_path)
    #print(ckpt_file)
    paths['model_path'] = ckpt_file
    model = BiLSTM_CRF(args,
                       embeddings,
                       tag2label,
                       word2id,
                       paths,
                       config=config)
    model.build_graph()
    saver = tf.train.Saver()

    classifier = fasttext.load_model(classifier_model_path)

    with tf.Session(config=config) as sess:
        print('============= ner_tags =============')
        saver.restore(sess, ckpt_file)

        ner_result_fb = open(ner_result_path, 'a+')

        corpus_lines = read_corpus(ner_corpus_path)
        for ner_line in corpus_lines:
            ner_line = ner_line.strip()
            ner_line = re.sub(r'\s+', '', ner_line)
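            # drop time-of-day qualifiers (early morning, morning, noon,
            # afternoon, evening, late night) that follow 日 ("day")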
            ner_line = re.sub(r'日(凌晨|早晨|上午|中午|下午|晚上|深夜)+', '日', ner_line)
            line_data = list(ner_line.replace(' ', '', 10).strip())
            line_data = [(line_data, ['O'] * len(line_data))]
            tags = model.get_ner_tag(sess, line_data)
            Location, Time, Means, Thing = get_entity(tags, ner_line)
            print('Location: {}\nTime: {}\nMeans: {}\nThing: {}'.format(
                Location, Time, Means, Thing))

            words = []
            if len(Time) > 0:
                time_info = get_date_info(Time[0])
                print(time_info)
                if time_info is None and len(Time) > 1:
                    time_info = get_date_info(Time[1])
                print(time_info)
                words += list(time_info)
            else:
                print('NoTime {}'.format(ner_line))
                words.append('NoTime')

            if len(Location) > 0:
                location_info = getlnglat(Location[0])
                print(location_info)
                words.append(location_info)
            else:
                print('NoLocation {}'.format(ner_line))
                words.append('NoLocation')

            if len(Means) > 0:
                words += Means
            else:
                print('NoMeans {}'.format(ner_line))
                words.append('NoMeans')

            if len(Thing) > 0:
                category = classifier.predict(Thing, k=1)[0][0][0].replace(
                    '__label__', '')
                print(category)
                words.append(category)
            else:
                print('NoClass {}'.format(ner_line))
                words.append('NoClass')

            print(words)
            ner_result_fb.write(' '.join(words) + "\t" + ner_line + "\n")

        ner_result_fb.close()
Example #15
logging.basicConfig(
    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
    datefmt='%a, %d %b %Y %H:%M:%S',
    filename='./logs/post.log',
    filemode='w',
)

console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)

logging.info('Reading training corpus')
vocabulary = Vocabulary()
with open(args["train_file"]) as train:
    corpus = read_corpus(train, vocabulary)

# tag_models = [PYPLM(args["tag_order"], initial_base=Uniform(args["n_tags"])) for _ in range(args["n_particles"])]
# word_models = [PYPLM(args["word_order"], initial_base=Uniform(len(vocabulary))) for _ in range(args["n_particles"])]

tag_models = [PYPLM(args["tag_order"], initial_base=Uniform(args["n_tags"]))]
word_models = [
    PYPLM(args["word_order"], initial_base=Uniform(len(vocabulary)))
]

logging.info('Training model of order %d', args["tag_order"])

ll_list, ppl_list = run_sentence_sampler(corpus,
                                         word_models,
                                         tag_models,
                                         n_tags=args["n_tags"],
Example #16
import config
import utils
import corpus
import machine_learning


if __name__ == '__main__':
    """
    Entry point for app
    """

    # create log-files and write headers
    utils.write_resultlog_headers()

    # read and preprocess corpus
    corpus = corpus.read_corpus(config.corpus_path)

    # run main program
    if not config.use_all_variants:
        machine_learning.run(corpus)

    else:   
        # gather all possible feature combinations
        f_combinations = utils.get_feature_combos()
        count = 1

        # run main program for all combinations
        for combo in f_combinations:
            config.feature_selection = combo

            print("\nRunning configuration {} of {}".format(count, len(f_combinations)))
Example #17
import ne_chunker
import corpus
from nltk import pos_tag, word_tokenize
from nltk.chunk import conlltags2tree, tree2conlltags

# Path to the corpus used here: the Groningen Meaning Bank (GMB) dataset
corpus_root = 'gmb-2.2.0'
mode = '--core'

data = corpus.read_corpus(corpus_root, mode)

training_samples = data[:int(len(data) * 0.9)]
test_samples = data[int(len(data) * 0.9):]

print "#training samples = %s" % len(
    training_samples)  # training samples = 55809
print "#test samples = %s" % len(test_samples)  # test samples = 6201

chunker = ne_chunker.NamedEntityChunker(training_samples[:55809])
# text = "Cristiano Ronaldo is a decent footballer both in Real Madrid, Spain and Manchester United, United Kingdom. He is truly a masterpiece."
text = "Geraldi Dzakwan wakes up at 7 am every morning."
print(chunker.parse(pos_tag(word_tokenize(text))))

score = chunker.evaluate([
    conlltags2tree([(w, t, iob) for (w, t), iob in iobs])
    for iobs in test_samples[:500]
])

# Debugging
print(score.accuracy())
# 0.931132334092
Example #18
def main():
    automaton = Automaton.create_from_dump(open(sys.argv[1]))
    corpus = read_corpus(open(sys.argv[2]), "#")
    dc = DistanceCache(automaton, corpus)
    dc.build_paths()
Example #19
def main(args):
    time1 = datetime.now()
    raw_corpus = corpus.read_corpus(args.corpus)
    list_words, vocab_map, embeddings, padding_id = corpus.load_embeddings(
        corpus.load_embedding_iterator(args.embeddings))
    print("loaded embeddings")
    ids_corpus = corpus.map_corpus(vocab_map, raw_corpus)
    annotations = corpus.read_annotations(args.train)
    print("got annotations")

    training_batches = corpus.create_batches(ids_corpus, annotations,
                                             args.batch_size, padding_id)
    print("got batches")

    time2 = datetime.now()
    print "time to preprocess: " + str(time2 - time1)

    if args.model == 'cnn':
        args.margin = 0.2

    if args.load_model:
        if args.model == 'lstm':
            print("loading " + args.load_model)
            lstm = nn.LSTM(input_size=args.embedding_size,
                           hidden_size=args.hidden_size)
            lstm.load_state_dict(torch.load(args.load_model))
            optimizer = Adam(lstm.parameters())
            if args.cuda:
                lstm.cuda()
        else:
            print("loading " + args.load_model)
            cnn = nn.Conv1d(in_channels=args.embedding_size,
                            out_channels=args.hidden_size,
                            kernel_size=3,
                            padding=1)
            cnn.load_state_dict(torch.load(args.load_model))
            optimizer = Adam(cnn.parameters())
            if args.cuda:
                cnn.cuda()
    else:
        if args.model == 'lstm':
            print "training lstm"
            lstm = nn.LSTM(input_size=args.embedding_size,
                           hidden_size=args.hidden_size)
            optimizer = Adam(lstm.parameters())
            if args.cuda:
                lstm.cuda()
        else:
            print "training cnn"
            cnn = nn.Conv1d(in_channels=args.embedding_size,
                            out_channels=args.hidden_size,
                            kernel_size=3,
                            padding=1)
            optimizer = Adam(cnn.parameters())
            if args.cuda:
                cnn.cuda()

    if args.save_model:
        if args.model == 'lstm':
            lstm_model_nums = []
            for d in os.listdir("lstm_models"):
                if "lstm_model" in d:
                    num = int(d[len("lstm_models") - 1:])
                    lstm_model_nums.append(num)
            if len(lstm_model_nums) > 0:
                new_model_num = max(lstm_model_nums) + 1
            else:
                new_model_num = 0
            print("creating new model " + "lstm_models/lstm_model" +
                  str(new_model_num))
            os.makedirs("lstm_models/lstm_model" + str(new_model_num))
        else:
            cnn_model_nums = []
            for d in os.listdir("cnn_models"):
                if "cnn_model" in d:
                    num = int(d[len("cnn_models") - 1:])
                    cnn_model_nums.append(num)
            if len(cnn_model_nums) > 0:
                new_model_num = max(cnn_model_nums) + 1
            else:
                new_model_num = 0
            print("creating new model " + "cnn_models/cnn_model" +
                  str(new_model_num))
            os.makedirs("cnn_models/cnn_model" + str(new_model_num))

    # lstm tutorial: http://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
    # lstm documentation: http://pytorch.org/docs/master/nn.html?highlight=nn%20lstm#torch.nn.LSTM

    count = 1
    hidden_states = []
    total_loss = 0.0
    time_begin = datetime.now()
    for epoch in range(10):
        print "epoch = " + str(epoch)
        for batch in training_batches:
            optimizer.zero_grad()
            if count % 10 == 0:
                print(count)
                print "average loss: " + str((total_loss / float(count)))
                print("time for 10 batches: " +
                      str(datetime.now() - time_begin))
                time_begin = datetime.now()
            titles, bodies, triples = batch
            title_length, title_num_questions = titles.shape
            body_length, body_num_questions = bodies.shape
            title_embeddings, body_embeddings = corpus.get_embeddings(
                titles, bodies, vocab_map, embeddings)

            # title
            if args.model == 'lstm':
                if args.cuda:
                    title_inputs = [
                        autograd.Variable(
                            torch.FloatTensor(title_embeddings).cuda())
                    ]
                    title_inputs = torch.cat(title_inputs).view(
                        title_length, title_num_questions, -1)
                    # title_inputs = torch.cat(title_inputs).view(title_num_questions, title_length, -1)

                    title_hidden = (autograd.Variable(
                        torch.zeros(1, title_num_questions,
                                    args.hidden_size).cuda()),
                                    autograd.Variable(
                                        torch.zeros(
                                            (1, title_num_questions,
                                             args.hidden_size)).cuda()))
                else:
                    title_inputs = [
                        autograd.Variable(torch.FloatTensor(title_embeddings))
                    ]
                    title_inputs = torch.cat(title_inputs).view(
                        title_length, title_num_questions, -1)

                    title_hidden = (autograd.Variable(
                        torch.zeros(1, title_num_questions, args.hidden_size)),
                                    autograd.Variable(
                                        torch.zeros((1, title_num_questions,
                                                     args.hidden_size))))
            else:
                if args.cuda:
                    title_inputs = [
                        autograd.Variable(
                            torch.FloatTensor(title_embeddings).cuda())
                    ]
                else:
                    title_inputs = [
                        autograd.Variable(torch.FloatTensor(title_embeddings))
                    ]
                title_inputs = torch.cat(title_inputs).transpose(0,
                                                                 1).transpose(
                                                                     1, 2)

            if args.model == 'lstm':
                title_out, title_hidden = lstm(title_inputs, title_hidden)
            else:
                title_out = cnn(title_inputs)
                title_out = F.tanh(title_out)
                title_out = title_out.transpose(1, 2).transpose(0, 1)

            # average all words of each question from title_out
            # title_out (max sequence length) x (batch size) x (hidden size)
            average_title_out = average_questions(title_out, titles,
                                                  padding_id)

            # body
            if args.model == 'lstm':
                if args.cuda:
                    body_inputs = [
                        autograd.Variable(
                            torch.FloatTensor(body_embeddings).cuda())
                    ]
                    body_inputs = torch.cat(body_inputs).view(
                        body_length, body_num_questions, -1)
                    # body_inputs = torch.cat(body_inputs).view(body_num_questions, body_length, -1)

                    body_hidden = (autograd.Variable(
                        torch.zeros(1, body_num_questions,
                                    args.hidden_size).cuda()),
                                   autograd.Variable(
                                       torch.zeros((1, body_num_questions,
                                                    args.hidden_size)).cuda()))
                else:
                    body_inputs = [
                        autograd.Variable(torch.FloatTensor(body_embeddings))
                    ]
                    body_inputs = torch.cat(body_inputs).view(
                        body_length, body_num_questions, -1)

                    body_hidden = (autograd.Variable(
                        torch.zeros(1, body_num_questions, args.hidden_size)),
                                   autograd.Variable(
                                       torch.zeros((1, body_num_questions,
                                                    args.hidden_size))))
            else:
                if args.cuda:
                    body_inputs = [
                        autograd.Variable(
                            torch.FloatTensor(body_embeddings).cuda())
                    ]
                else:
                    body_inputs = [
                        autograd.Variable(torch.FloatTensor(body_embeddings))
                    ]
                body_inputs = torch.cat(body_inputs).transpose(0, 1).transpose(
                    1, 2)

            if args.model == 'lstm':
                body_out, body_hidden = lstm(body_inputs, body_hidden)
            else:
                body_out = cnn(body_inputs)
                body_out = F.tanh(body_out)
                body_out = body_out.transpose(1, 2).transpose(0, 1)

            average_body_out = average_questions(body_out, bodies, padding_id)
            count += 1

            # average body and title
            # representations of the questions as found by the LSTM
            hidden = (average_title_out + average_body_out) * 0.5
            if args.cuda:
                triples_vectors = hidden[torch.LongTensor(
                    triples.ravel()).cuda()]
            else:
                triples_vectors = hidden[torch.LongTensor(triples.ravel())]
            triples_vectors = triples_vectors.view(triples.shape[0],
                                                   triples.shape[1],
                                                   args.hidden_size)

            query = triples_vectors[:, 0, :].unsqueeze(1)
            examples = triples_vectors[:, 1:, :]

            cos_similarity = F.cosine_similarity(query, examples, dim=2)
            if args.cuda:
                targets = autograd.Variable(
                    torch.zeros(triples.shape[0]).type(
                        torch.LongTensor).cuda())
            else:
                targets = autograd.Variable(
                    torch.zeros(triples.shape[0]).type(torch.LongTensor))
            # outputs a Variable
            # By default, the losses are averaged over observations for each minibatch
            if args.cuda:
                loss = F.multi_margin_loss(cos_similarity,
                                           targets,
                                           margin=args.margin).cuda()
            else:
                loss = F.multi_margin_loss(cos_similarity,
                                           targets,
                                           margin=args.margin)
            total_loss += loss.cpu().data.numpy()[0]
            loss.backward()

            optimizer.step()

        result_headers = ['Epoch', 'MAP', 'MRR', 'P@1', 'P@5']
        with open(os.path.join(sys.path[0], args.results_file),
                  'a') as evaluate_file:
            writer = csv.writer(evaluate_file, dialect='excel')
            writer.writerow(result_headers)

        if args.model == 'lstm':
            evaluation(args, padding_id, ids_corpus, vocab_map, embeddings,
                       lstm, epoch)
        else:
            evaluation(args, padding_id, ids_corpus, vocab_map, embeddings,
                       cnn, epoch)

        if args.save_model:
            # saving the model
            if args.model == 'lstm':
                print "Saving lstm model epoch " + str(
                    epoch) + " to lstm_model" + str(new_model_num)
                torch.save(
                    lstm.state_dict(), "lstm_models/lstm_model" +
                    str(new_model_num) + '/' + "epoch" + str(epoch))
            else:
                print "Saving cnn model epoch " + str(
                    epoch) + " to cnn_model" + str(new_model_num)
                torch.save(
                    cnn.state_dict(), "cnn_models/cnn_model" +
                    str(new_model_num) + '/' + "epoch" + str(epoch))
Example #20
def main(args):
    """This file performs domain transfer using an adversarial discriminative network. Example usage:

    python adversarial_domain.py --ubuntu_path ../askubuntu --android_path ../Android --embeddings ../glove.pruned.txt.gz"""

    ubuntu_corpus = os.path.join(args.ubuntu_path, 'text_tokenized.txt.gz')
    android_corpus = os.path.join(args.android_path, 'corpus.tsv.gz')
    ubuntu_raw_corpus = corpus.read_corpus(ubuntu_corpus)
    android_raw_corpus = corpus.read_corpus(android_corpus)
    list_words, vocab_map, embeddings, padding_id = corpus.load_embeddings(
        corpus.load_embedding_iterator(args.embeddings))
    print "loaded embeddings"

    ubuntu_ids_corpus = corpus.map_corpus(vocab_map, ubuntu_raw_corpus)
    android_ids_corpus = corpus.map_corpus(vocab_map, android_raw_corpus)
    ubuntu_train = os.path.join(args.ubuntu_path, 'train_random.txt')
    ubuntu_train_annotations = corpus.read_annotations(ubuntu_train)
    print(len(ubuntu_train_annotations))
    ubuntu_training_batches = corpus.create_batches(ubuntu_ids_corpus,
                                                    ubuntu_train_annotations,
                                                    args.batch_size,
                                                    padding_id)
    print "got ubuntu batches"

    if args.load_model:
        if args.model == 'lstm':
            print("loading " + args.load_model)
            lstm = nn.LSTM(input_size=300, hidden_size=args.hidden_size)
            lstm.load_state_dict(torch.load(args.load_model))
            optimizer = Adam(lstm.parameters())
            if args.cuda:
                lstm.cuda()
        else:
            print("loading " + args.load_model)
            cnn = nn.Conv1d(in_channels=300,
                            out_channels=args.hidden_size,
                            kernel_size=3,
                            padding=1)
            cnn.load_state_dict(torch.load(args.load_model))
            optimizer = Adam(cnn.parameters())
            if args.cuda:
                cnn.cuda()
    else:
        if args.model == 'lstm':
            print "training lstm"
            lstm = nn.LSTM(input_size=300, hidden_size=args.hidden_size)
            optimizer = Adam(lstm.parameters())
            if args.cuda:
                lstm.cuda()
        else:
            print "training cnn"
            cnn = nn.Conv1d(in_channels=300,
                            out_channels=args.hidden_size,
                            kernel_size=3,
                            padding=1)
            optimizer = Adam(cnn.parameters())
            if args.cuda:
                cnn.cuda()

    feed_forward = FeedForward(args)
    if args.cuda:
        feed_forward.cuda()
    feed_forward_optimizer = Adam(feed_forward.parameters(), lr=-0.001)
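    # note: the negative learning rate appears to flip the sign of the
    # discriminator's update, so feed_forward still descends the domain loss
    # even though combined_loss below subtracts lam * domain_classifier_loss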

    android_dev_pos_path = os.path.join(args.android_path, 'dev.pos.txt')
    android_dev_neg_path = os.path.join(args.android_path, 'dev.neg.txt')
    android_dev_annotations = android_pairs_to_annotations(
        android_dev_pos_path, android_dev_neg_path)

    count = 1
    hidden_states = []
    total_encoder_loss = 0.0
    total_domain_loss = 0.0
    total_loss = 0.0
    time_begin = datetime.now()
    time_begin_epoch = datetime.now()
    for epoch in range(20):
        print "epoch = " + str(epoch)
        for batch in ubuntu_training_batches:

            titles, bodies, triples = batch

            optimizer.zero_grad()
            if count % 10 == 0:
                print(count)
                print "average encoder loss: " + str(
                    (total_encoder_loss / float(count)))
                print "average domain loss: " + str(
                    (total_domain_loss / float(count)))
                print "average loss: " + str((total_loss / float(count)))
                print("time for 10 batches: " +
                      str(datetime.now() - time_begin))
                time_begin = datetime.now()
            count += 1

            ubuntu_batch = corpus.domain_classifier_batch(
                ubuntu_ids_corpus, ubuntu_train_annotations, padding_id)
            ubuntu_titles, ubuntu_bodies, _ = ubuntu_batch
            android_batch = corpus.domain_classifier_batch(
                android_ids_corpus, android_dev_annotations, padding_id)
            android_titles, android_bodies, _ = android_batch

            # print "shapes"
            # print ubuntu_titles.shape
            # print android_titles.shape

            if args.model == 'lstm':
                model = lstm
            else:
                model = cnn

            hidden_ubuntu = vectorize_question(args, batch, model, vocab_map,
                                               embeddings, padding_id)
            hidden_ubuntu_domain = vectorize_question(args, ubuntu_batch,
                                                      model, vocab_map,
                                                      embeddings, padding_id)
            hidden_android_domain = vectorize_question(args, android_batch,
                                                       model, vocab_map,
                                                       embeddings, padding_id)
            hidden_combined = torch.cat(
                (hidden_ubuntu_domain, hidden_android_domain))
            input_size = int(hidden_combined.size()[0])

            output = feed_forward.forward(hidden_combined)

            domain_labels = ([1] * int(hidden_ubuntu_domain.size()[0]) +
                             [0] * int(hidden_android_domain.size()[0]))
            if args.cuda:
                domain_labels = autograd.Variable(
                    torch.LongTensor(domain_labels).cuda())
            else:
                domain_labels = autograd.Variable(
                    torch.LongTensor(domain_labels))

            if args.cuda:
                triples_vectors = hidden_ubuntu[torch.LongTensor(
                    triples.ravel()).cuda()]
            else:
                triples_vectors = hidden_ubuntu[torch.LongTensor(
                    triples.ravel())]
            triples_vectors = triples_vectors.view(triples.shape[0],
                                                   triples.shape[1],
                                                   args.hidden_size)

            query = triples_vectors[:, 0, :].unsqueeze(1)
            examples = triples_vectors[:, 1:, :]
            cos_similarity = F.cosine_similarity(query, examples, dim=2)
            if args.cuda:
                targets = autograd.Variable(
                    torch.zeros(triples.shape[0]).type(
                        torch.LongTensor).cuda())
            else:
                targets = autograd.Variable(
                    torch.zeros(triples.shape[0]).type(torch.LongTensor))
            if args.cuda:
                encoder_loss = F.multi_margin_loss(cos_similarity,
                                                   targets,
                                                   margin=args.margin).cuda()
            else:
                encoder_loss = F.multi_margin_loss(cos_similarity,
                                                   targets,
                                                   margin=args.margin)
            total_encoder_loss += encoder_loss.cpu().data.numpy()[0]

            # if args.cuda:
            #     domain_loss_func = nn.CrossEntropyLoss().cuda()
            # else:
            #     domain_loss_func = nn.CrossEntropyLoss()
            # domain_classifier_loss = domain_loss_func(output, domain_labels)
            if args.cuda:
                domain_classifier_loss = F.cross_entropy(
                    output, domain_labels).cuda()
            else:
                domain_classifier_loss = F.cross_entropy(output, domain_labels)
            total_domain_loss += domain_classifier_loss.cpu().data.numpy()[0]

            combined_loss = encoder_loss - args.lam * domain_classifier_loss
            total_loss += combined_loss.cpu().data.numpy()[0]
            combined_loss.backward()

            optimizer.step()
            feed_forward_optimizer.step()

        print "time for one epoch: " + str(datetime.now() - time_begin_epoch)
        time_begin_epoch = datetime.now()
        evaluation(args, padding_id, android_ids_corpus, model, vocab_map,
                   embeddings)