Code example #1
def eval_seq_train(gold, pre, labels, hc=None, features=None):
    """
    Evaluate a sequence labeler: accumulate NER counts over all sentences,
    then report precision, recall, and F1.
    """
    n = len(gold)
    tp = 0
    fp = 0
    fn = 0

    for i in range(n):
        (x, y, z) = eval_ner(gold[i], pre[i], labels)
        tp += x
        fp += y
        fn += z
        if hc is not None:
            # Print any sentence with at least one error for inspection.
            if y + z > 0:
                sen = hc.sentences[i]
                print "sen: ", util.get_words(sen, features) + " OOV = " + str(has_oov(sen))
                print "true labels: ", util.get_lab_name(gold[i], labels)
                print "predicted: ", util.get_lab_name(pre[i], labels)

    try:
        # Use a distinct name for precision; `pre` holds the predictions.
        prec = tp * 1.0 / (tp + fp)
        rec = tp * 1.0 / (tp + fn)
        f = 2.0 * prec * rec / (prec + rec)
        print prec, rec, f
    except ZeroDivisionError:
        print "DIV BY 0 ", tp, fp, fn
Code example #2
File: data_processing.py Project: Zendom88/BestBuy
def correct_query(raw_query, lemmatizer, local_cache):
    raw_query = raw_query.lower().strip()
    if raw_query in local_cache:
        return local_cache[raw_query]
    words = get_words(raw_query)
    new_words = []
    for w in words:
        split = get_split(w)
        # get_split returns a tuple when the token was split apart.
        if isinstance(split, tuple):
            new_words.extend(split)
        else:
            new_words.append(split)
    lemmas = []
    for w in new_words:
        lemma = lemmatizer.lemmatize(w)
        # When the lemma is a mixed token of length >= 4 (neither all
        # digits nor all letters, e.g. 'sd500'), try splitting the original
        # word into its word and number parts.
        if len(lemma) >= 4 and not lemma.isdigit() and not lemma.isalpha():
            split = split_word_num(w)
            if isinstance(split, tuple):
                w, num = split
                lemma = ' '.join([w, num])
        lemmas.append(lemma)
    new_query = ' '.join(lemmas)
    local_cache[raw_query] = new_query
    return new_query
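
Neither get_split nor split_word_num appears in this listing. Judging from the tuple-or-string check above, split_word_num returns a (word, number) tuple when it can split a mixed token and the token unchanged otherwise. A sketch under that assumption, not the project's actual helper:

import re

def split_word_num(token):
    # Split a trailing number off a token, e.g. 'sd500' -> ('sd', '500');
    # return the token unchanged when there is nothing to split.
    match = re.match(r'^([a-z]+)([0-9]+)$', token)
    if match:
        return (match.group(1), match.group(2))
    return token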
Code example #3
File: main.py Project: raziakram/solutions
def make_predictions(st_line, ed_line, out_file, pname, models):
    (cat_count, item_count, item_sort, alpha, beta, item_word,
     bigram_item_word, time_cat_item_dict, cat_word, hot_words) = models[0]
    reader = readfile(new_test_file)
    writer = writefile(out_file)
    line_idx = 0
    for (user, category, raw_query, click_time) in reader:
        line_idx += 1
        if line_idx < st_line:
            continue
        if line_idx > ed_line:
            break
        if line_idx % TEST_STEP == 0:
            print '%s--%d' % (pname, line_idx / TEST_STEP)
        time_block = get_time_feature(click_time)
        try:
            bound = cat_count[category][PREDICT_HOT_SIZE]
            hots = [x[0] for x in item_sort[category][0:bound]]
        except (KeyError, IndexError):  # category absent from the model
            writer.writerow(["0"])
            continue
        try:
            bigram = get_bigram_word(raw_query, hot_words, category)
            words = get_words(raw_query)
            query_size = sum([cat_word[category][w] for w in words])
            if query_size >= 100 and len(bigram) > 0:
                # Only queries that are hot enough and can generate bigram
                # features are scored by the boosting model.
                rank = [[sku,
                         boosting_bayes(bigram, words, category, sku, alpha,
                                        beta, item_word, bigram_item_word,
                                        item_count, cat_count,
                                        time_cat_item_dict, time_block)]
                        for sku in hots]
            elif query_size >= 100 and len(bigram) == 0:
                # Hot enough but no bigram features: fall back to naive
                # Bayes with time information.
                rank = [[sku,
                         time_bayes_query_prediction(words, category, sku,
                                                     alpha, beta, item_word,
                                                     item_count, cat_count,
                                                     time_cat_item_dict,
                                                     time_block)]
                        for sku in hots]
            else:
                # Otherwise use plain naive Bayes.
                rank = [[sku,
                         plain_bayes_query_prediction(words, category, sku,
                                                      alpha, beta, item_word,
                                                      item_count, cat_count)]
                        for sku in hots]
            rank = sorted(rank, key=lambda x: x[1], reverse=True)
            guesses = [i[0] for i in rank[0:5]]
            guesses = rerank_guess(guesses, user, raw_query)

            writer.writerow([" ".join(guesses)])
        except (TypeError, KeyError):  # a category we haven't seen before
            writer.writerow([" ".join(hots[0:5])])
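
None of the three scoring functions appear in this listing. As a rough illustration only, a smoothed unigram score in the spirit of plain_bayes_query_prediction might look like the sketch below; everything beyond the argument list is an assumption, including treating alpha and beta as additive-smoothing constants:

import math

def plain_bayes_query_prediction(words, category, sku, alpha, beta,
                                 item_word, item_count, cat_count):
    # Smoothed prior on the sku plus one smoothed unigram likelihood term
    # per query word; the nested-dict shapes are guessed, and cat_count is
    # unused in this sketch.
    score = math.log(item_count[category][sku] + alpha)
    for w in words:
        score += math.log(item_word[category][sku][w] + beta)
        score -= math.log(item_count[category][sku] + beta)
    return score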
Code example #4
    def get_title_words(self, dedupe=True):
        """Get the words (lowercase) from the title.

        Args:
            dedupe: Flag indicating if only unique words should be returned. If True, only unique
                words will be returned (in no particular order). If False, all words found will
                be returned in original order with duplicates.
        Returns:
            Iterable over strings representing the words found in the title.
        """
        return util.get_words(self.get_title(), dedupe=dedupe)
Code example #5
File: views.py Project: markaurelius/verbquest
def song_search(request):
    query = request.path.split("/")[-1]
    res = song.search(sort='song_hotttnesss-desc', combined=query, results=1,
                      buckets=['id:lyricfind-US'], limit=True)[0]
    lid = res.get_foreign_id('lyricfind-US').split(":")[-1]
    lyrics = util.get_lyrics(lid)
    words = util.clean_lyrics(lyrics)
    words = util.get_words(words)
    words = util.remove_common_verbs(words)
    # `words` is a dict of verb -> count; keep the ten most frequent.
    sorted_lyric_data = sorted(words.iteritems(), key=itemgetter(1), reverse=True)[0:10]
    sorted_lyric_counts = [x[1] for x in sorted_lyric_data]
    sorted_lyrics = [x[0] for x in sorted_lyric_data]
    response = {'artist': res.artist_name, 'verbs': lyrics, 'answer': res.title,
                'sorted_verbs': sorted_lyrics, 'sorted_verb_counts': sorted_lyric_counts}
    return render_to_response("song_results.html", response)
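
Unlike the list-returning get_words in the test cases further down, this project's util.get_words evidently returns a word -> count mapping, since the view sorts words.iteritems() by count. A minimal counting variant might look like this (an assumption, not verbquest's actual code):

from collections import Counter

def get_words(text):
    # Map each whitespace-separated token to its number of occurrences;
    # the view above sorts this mapping by count.
    return Counter(text.lower().split())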
Code example #6
    def query():
        """Query for prototypical articles within a topic (using "search" url param).

        Returns:
            JSON listing of prototypical records for the given topic.
        """
        query_string = flask.request.args.get('search')
        keywords = util.get_words(query_string)
        report_maybe('query', query_string)
        records = records_keep.query(keywords)
        records_serial = list(
            sorted(map(model.serialize_record_to_dict, records),
                   key=lambda x: x['source']))
        return json.dumps({'records': records_serial})
Code example #7
File: main.py Project: raziakram/solutions
def get_unigram_model(item_sort, cat_count):
    reader = readfile(new_train_file)
    item_word = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    cat_word = defaultdict(lambda: defaultdict(int))
    for (__user, sku, category, raw_query, ___click_time) in reader:
        bound = cat_count[category][HOT_SIZE]
        popular = [i[0] for i in item_sort[category][0:bound]]
        if sku in popular:
            words = get_words(raw_query)
            for w in words:
                # magic_num is a count increment defined elsewhere in the
                # project.
                item_word[category][sku][w] += magic_num
                cat_word[category][w] += magic_num
    return item_word, cat_word
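
The triply nested defaultdict means a brand-new (category, sku, word) path can be incremented without any key checks. A quick standalone illustration with hypothetical keys:

from collections import defaultdict

item_word = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
item_word['cameras']['sku123']['canon'] += 1  # no KeyError on first touch
print(item_word['cameras']['sku123']['canon'])  # -> 1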
Code example #8
File: main.py Project: harixxy/solutions
def get_unigram_model(item_sort, cat_count):
    reader = readfile(new_train_file)
    item_word = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    cat_word = defaultdict(lambda: defaultdict(int))
    for (__user, sku, category, raw_query, ___click_time) in reader:
        bound = cat_count[category][HOT_SIZE]
        popular = [i[0] for i in item_sort[category][0:bound]]
        if sku in popular:
            words = get_words(raw_query)
            for w in words:
                # magic_num is a count increment defined elsewhere in the
                # project.
                item_word[category][sku][w] += magic_num
                cat_word[category][w] += magic_num
    return item_word, cat_word
Code example #9
File: views.py Project: markaurelius/verbquest
def api_song_search(request):
    query = request.path.split("/")[-1]
    try:
        res = song.search(combined=query, results=1,
                          buckets=['id:lyricfind-US'], limit=True)[0]
        lid = res.get_foreign_id('lyricfind-US').split(":")[-1]
        lyrics = util.get_lyrics(lid)
        words = util.clean_lyrics(lyrics)
        words = util.get_words(words)
        words = util.remove_common_words(words)
        # Ascending sort, then slice the tail: the 20 most frequent words.
        sorted_lyric_data = sorted(words.iteritems(), key=itemgetter(1), reverse=False)[-20:]
        sorted_lyric_counts = [x[1] for x in sorted_lyric_data]
        sorted_lyrics = [x[0] for x in sorted_lyric_data]
        sorted_lyric_combined = [[x[1], x[0]] for x in sorted_lyric_data]
        response = {'artist': res.artist_name, 'title': res.title,
                    'sorted_words': sorted_lyric_combined}
    except Exception as e:
        response = {'error': str(e)}
Code example #10
def eval_hc_test(hc, features, labels, print_err=False, decoder='hc'):
    """
    Evaluate on the test set (the "testa" files).
    :param hc: trained sequence model
    :param labels: label set
    :return: prints precision, recall, and F1
    """
    tp = 0
    fp = 0
    fn = 0

    dirname = "testa"
    lines = []
    for fname in os.listdir(dirname):
        if fname.endswith(".txt"):
            fh = open(os.path.join(dirname, fname))
            lines.extend(list(fh))
            fh.close()
    sentences = util.extract(lines, features, labels, keep_word=True)

    for sen in sentences:
        predicted = get_tag(hc, sen, features, decoder)
        (x, y, z) = eval_ner(util.get_lab(sen), predicted, labels)
        tp += x
        fp += y
        fn += z
        # Print any sentence with at least one error for inspection.
        if print_err and y + z > 0:
            print "sen: ", util.get_words(sen, features) + " OOV = " + str(has_oov(sen))
            print "true labels: ", util.get_lab_name(util.get_lab(sen), labels)
            print "predicted: ", util.get_lab_name(predicted, labels)

    try:
        pre = tp * 1.0 / (tp + fp)
        rec = tp * 1.0 / (tp + fn)
        f = 2.0 * pre * rec / (pre + rec)
        print pre, rec, f
    except ZeroDivisionError:
        print "DIV BY 0 ", tp, fp, fn
Code example #11
File: main.py Project: harixxy/solutions
def make_predictions(st_line, ed_line, out_file, pname, models):
    (cat_count, item_count, item_sort, alpha, beta, item_word,
     bigram_item_word, time_cat_item_dict, cat_word, hot_words) = models[0]
    reader = readfile(new_test_file)
    writer = writefile(out_file)
    line_idx = 0
    for (user, category, raw_query, click_time) in reader:
        line_idx += 1
        if line_idx < st_line:
            continue
        if line_idx > ed_line:
            break
        if line_idx % TEST_STEP == 0:
            print '%s--%d' % (pname, line_idx / TEST_STEP)
        time_block = get_time_feature(click_time)
        try:
            bound = cat_count[category][PREDICT_HOT_SIZE]
            hots = [x[0] for x in item_sort[category][0:bound]]
        except (KeyError, IndexError):  # category absent from the model
            writer.writerow(["0"])
            continue
        try:
            bigram = get_bigram_word(raw_query, hot_words, category)
            words = get_words(raw_query)
            query_size = sum([cat_word[category][w] for w in words])
            if query_size >= 100 and len(bigram) > 0:
                # Only queries that are hot enough and can generate bigram
                # features are scored by the boosting model.
                rank = [[sku, boosting_bayes(bigram, words, category, sku, alpha, beta, item_word, bigram_item_word, item_count, cat_count, time_cat_item_dict, time_block)] for sku in hots]
            elif query_size >= 100 and len(bigram) == 0:
                # Hot enough but no bigram features: fall back to naive
                # Bayes with time information.
                rank = [[sku, time_bayes_query_prediction(words, category, sku, alpha, beta, item_word, item_count, cat_count, time_cat_item_dict, time_block)] for sku in hots]
            else:
                # Otherwise use plain naive Bayes.
                rank = [[sku, plain_bayes_query_prediction(words, category, sku, alpha, beta, item_word, item_count, cat_count)] for sku in hots]
            rank = sorted(rank, key=lambda x: x[1], reverse=True)
            guesses = [i[0] for i in rank[0:5]]
            guesses = rerank_guess(guesses, user, raw_query)

            writer.writerow([" ".join(guesses)])
        except (TypeError, KeyError):  # a category we haven't seen before
            writer.writerow([" ".join(hots[0:5])])
Code example #12
File: data_processing.py Project: harixxy/solutions
def correct_query(raw_query, lemmatizer, local_cache):
    raw_query = raw_query.lower().strip()
    if raw_query in local_cache:
        return local_cache[raw_query]
    words = get_words(raw_query)
    new_words = []
    for w in words:
        split = get_split(w)
        # get_split returns a tuple when the token was split apart.
        if isinstance(split, tuple):
            new_words.extend(split)
        else:
            new_words.append(split)
    lemmas = []
    for w in new_words:
        lemma = lemmatizer.lemmatize(w)
        # When the lemma is a mixed token of length >= 4 (neither all
        # digits nor all letters, e.g. 'sd500'), try splitting the original
        # word into its word and number parts.
        if len(lemma) >= 4 and not lemma.isdigit() and not lemma.isalpha():
            split = split_word_num(w)
            if isinstance(split, tuple):
                w, num = split
                lemma = ' '.join([w, num])
        lemmas.append(lemma)
    new_query = ' '.join(lemmas)
    local_cache[raw_query] = new_query
    return new_query
Code example #13
        # Reserve two extra one-hot slots past the longest paragraph: one
        # for unknown answers ('unk') and one for the empty answer ('').
        answer_one_hot = [0 for i in range(largest_num_of_words_any_paragraph + 2)]
        answer_one_hot[largest_num_of_words_any_paragraph] = 1
        answer_lookup_dict['unk'] = answer_one_hot
        answer_one_hot = [0 for i in range(largest_num_of_words_any_paragraph + 2)]
        answer_one_hot[largest_num_of_words_any_paragraph + 1] = 1
        answer_lookup_dict[''] = answer_one_hot
        feed_dict = {question: questions_words, text: paragraphs_sentences}
        classification = sess.run(answer_softmax, feed_dict)
        print util.get_words(classification, answer_lookup_dict,
                             largest_num_of_words_in_answer)

    while True:
        proceed = raw_input(
            "Do you want to ask another question (q), enter new paragraph (p) or exit (e): "
        )
        if proceed not in ['p', 'q', 'e']:
            print "Invalid input"
        else:
            break
Code example #14
File: main.py Project: tkondrashov/thisminute
			LEFT JOIN tweet_properties ON id = tweet_id
			WHERE TO_TIMESTAMP(%s) <= time AND time < TO_TIMESTAMP(%s)
			ORDER BY time ASC
		""", (last_runtime, current_time))

    last_runtime = current_time

    if current_time - last_traintime >= 30:
        pericog.update()
        last_traintime = current_time

    ids = []
    X = []
    for id, timestamp, geolocation, exact, user, text in db_tweets_cursor.fetchall():
        # Skip tweets that tokenize to no words at all.
        if not get_words(text):
            continue

        ids.append(id)
        X.append(text)

        db_tweets_cursor.execute(
            """
            INSERT INTO tweet_votes
                (tweet_id, user_ip, disaster)
            VALUES
                (%s, '0.0.0.0', False)
            """, (id, ))

    if X:
        Y = pericog.predict(X)
Code example #15
File: main.py Project: harixxy/solutions
def get_bigram_word(raw_query, hot_words, cat):
    words = get_words(raw_query)
    # Keep only words that are "hot" for this category, then sort so each
    # unordered pair has a canonical representation.
    words = [w for w in words if w in hot_words[cat]]
    words.sort()
    bigram = get_pair(words)
    return bigram
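
get_pair is not shown in the listing. Since the words are sorted first, it plausibly enumerates unordered word pairs; a sketch under that assumption:

from itertools import combinations

def get_pair(words):
    # All unordered pairs of the (already sorted) word list.
    return list(combinations(words, 2))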
Code example #16
 def test_get_words(self):
     words = util.get_words('test sentence 1.', False)
     self.assertEquals(len(words), 3)
     self.assertEquals(words[0], 'test')
     self.assertEquals(words[1], 'sentence')
     self.assertEquals(words[2], '1')
Code example #17
 def test_get_words_special_chars(self):
     words = util.get_words('test sentence-1.', False)
     self.assertEquals(len(words), 2)
     self.assertEquals(words[0], 'test')
     self.assertEquals(words[1], 'sentence-1')
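
Together with the dedupe flag documented in Code example #4, these two tests pin down the tokenizer's observable behavior: lowercase word tokens, trailing punctuation dropped, hyphenated tokens kept whole. A minimal sketch consistent with the assertions above (an assumption, not the project's actual util.get_words):

import re

def get_words(text, dedupe=True):
    # Hyphens count as word characters, so 'sentence-1.' yields
    # 'sentence-1' while the trailing '.' is dropped.
    words = re.findall(r"[\w-]+", text.lower())
    return set(words) if dedupe else words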
Code example #18
File: main.py Project: raziakram/solutions
def get_bigram_word(raw_query, hot_words, cat):
    words = get_words(raw_query)
    words = [w for w in words if w in hot_words[cat]]
    words.sort()
    bigram = get_pair(words)
    return bigram
Code example #19
File: models.py Project: jbochi/to-a-pe
 def searchable_words(self):
     return get_words(self.search_text())