Code example #1
def eval_seq_train(gold, pre, labels, hc=None, features=None):
    """
    Evaluate a sequence labeler: accumulate NER counts over all sentences,
    then report precision, recall, and F1.
    """
    n = len(gold)
    tp = 0
    fp = 0
    fn = 0

    for i in range(n):
        (x, y, z) = eval_ner(gold[i], pre[i], labels)
        tp += x
        fp += y
        fn += z
        if hc is not None:
            # Print any sentence with at least one error for inspection.
            if y + z > 0:
                sen = hc.sentences[i]
                print "sen: ", util.get_words(sen, features) + " OOV = " + str(has_oov(sen))
                print "true labels: ", util.get_lab_name(gold[i], labels)
                print "predicted: ", util.get_lab_name(pre[i], labels)

    try:
        # Use a distinct name for precision; `pre` holds the predictions.
        prec = tp * 1.0 / (tp + fp)
        rec = tp * 1.0 / (tp + fn)
        f = 2.0 * prec * rec / (prec + rec)
        print prec, rec, f
    except ZeroDivisionError:
        print "DIV BY 0 ", tp, fp, fn
Code example #2
File: data_processing.py Project: Zendom88/BestBuy
def correct_query(raw_query, lemmatizer, local_cache):
    raw_query = raw_query.lower().strip()
    if raw_query in local_cache:
        return local_cache[raw_query]
    words = get_words(raw_query)
    new_words = []
    for w in words:
        split = get_split(w)
        # get_split returns a tuple when the token was split apart.
        if isinstance(split, tuple):
            new_words.extend(split)
        else:
            new_words.append(split)
    lemmas = []
    for w in new_words:
        lemma = lemmatizer.lemmatize(w)
        # When the lemma is a mixed token of length >= 4 (neither all
        # digits nor all letters, e.g. 'sd500'), try splitting the original
        # word into its word and number parts.
        if len(lemma) >= 4 and not lemma.isdigit() and not lemma.isalpha():
            split = split_word_num(w)
            if isinstance(split, tuple):
                w, num = split
                lemma = ' '.join([w, num])
        lemmas.append(lemma)
    new_query = ' '.join(lemmas)
    local_cache[raw_query] = new_query
    return new_query
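
Neither get_split nor split_word_num appears in this listing. Judging from the tuple-or-string check above, split_word_num returns a (word, number) tuple when it can split a mixed token and the token unchanged otherwise. A sketch under that assumption, not the project's actual helper:

import re

def split_word_num(token):
    # Split a trailing number off a token, e.g. 'sd500' -> ('sd', '500');
    # return the token unchanged when there is nothing to split.
    match = re.match(r'^([a-z]+)([0-9]+)$', token)
    if match:
        return (match.group(1), match.group(2))
    return token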
Code example #3
File: main.py Project: raziakram/solutions
def make_predictions(st_line, ed_line, out_file, pname, models):
    (cat_count, item_count, item_sort, alpha, beta, item_word,
     bigram_item_word, time_cat_item_dict, cat_word, hot_words) = models[0]
    reader = readfile(new_test_file)
    writer = writefile(out_file)
    line_idx = 0
    for (user, category, raw_query, click_time) in reader:
        line_idx += 1
        if line_idx < st_line:
            continue
        if line_idx > ed_line:
            break
        if line_idx % TEST_STEP == 0:
            print '%s--%d' % (pname, line_idx / TEST_STEP)
        time_block = get_time_feature(click_time)
        try:
            bound = cat_count[category][PREDICT_HOT_SIZE]
            hots = [x[0] for x in item_sort[category][0:bound]]
        except (KeyError, IndexError):  # category absent from the model
            writer.writerow(["0"])
            continue
        try:
            bigram = get_bigram_word(raw_query, hot_words, category)
            words = get_words(raw_query)
            query_size = sum([cat_word[category][w] for w in words])
            if query_size >= 100 and len(bigram) > 0:
                # Only queries that are hot enough and can generate bigram
                # features are scored by the boosting model.
                rank = [[sku,
                         boosting_bayes(bigram, words, category, sku, alpha,
                                        beta, item_word, bigram_item_word,
                                        item_count, cat_count,
                                        time_cat_item_dict, time_block)]
                        for sku in hots]
            elif query_size >= 100 and len(bigram) == 0:
                # Hot enough but no bigram features: fall back to naive
                # Bayes with time information.
                rank = [[sku,
                         time_bayes_query_prediction(words, category, sku,
                                                     alpha, beta, item_word,
                                                     item_count, cat_count,
                                                     time_cat_item_dict,
                                                     time_block)]
                        for sku in hots]
            else:
                # Otherwise use plain naive Bayes.
                rank = [[sku,
                         plain_bayes_query_prediction(words, category, sku,
                                                      alpha, beta, item_word,
                                                      item_count, cat_count)]
                        for sku in hots]
            rank = sorted(rank, key=lambda x: x[1], reverse=True)
            guesses = [i[0] for i in rank[0:5]]
            guesses = rerank_guess(guesses, user, raw_query)

            writer.writerow([" ".join(guesses)])
        except (TypeError, KeyError):  # a category we haven't seen before
            writer.writerow([" ".join(hots[0:5])])
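
None of the three scoring functions appear in this listing. As a rough illustration only, a smoothed unigram score in the spirit of plain_bayes_query_prediction might look like the sketch below; everything beyond the argument list is an assumption, including treating alpha and beta as additive-smoothing constants:

import math

def plain_bayes_query_prediction(words, category, sku, alpha, beta,
                                 item_word, item_count, cat_count):
    # Smoothed prior on the sku plus one smoothed unigram likelihood term
    # per query word; the nested-dict shapes are guessed, and cat_count is
    # unused in this sketch.
    score = math.log(item_count[category][sku] + alpha)
    for w in words:
        score += math.log(item_word[category][sku][w] + beta)
        score -= math.log(item_count[category][sku] + beta)
    return score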
Code example #4
    def get_title_words(self, dedupe=True):
        """Get the words (lowercase) from the title.

        Args:
            dedupe: Flag indicating if only unique words should be returned. If True, only unique
                words will be returned (in no particular order). If False, all words found will
                be returned in original order with duplicates.
        Returns:
            Iterable over strings representing the words found in the title.
        """
        return util.get_words(self.get_title(), dedupe=dedupe)
Code example #5
File: views.py Project: markaurelius/verbquest
def song_search(request):
    query = request.path.split("/")[-1]
    res = song.search(sort='song_hotttnesss-desc', combined=query, results=1,
                      buckets=['id:lyricfind-US'], limit=True)[0]
    lid = res.get_foreign_id('lyricfind-US').split(":")[-1]
    lyrics = util.get_lyrics(lid)
    words = util.clean_lyrics(lyrics)
    words = util.get_words(words)
    words = util.remove_common_verbs(words)
    # `words` is a dict of verb -> count; keep the ten most frequent.
    sorted_lyric_data = sorted(words.iteritems(), key=itemgetter(1), reverse=True)[0:10]
    sorted_lyric_counts = [x[1] for x in sorted_lyric_data]
    sorted_lyrics = [x[0] for x in sorted_lyric_data]
    response = {'artist': res.artist_name, 'verbs': lyrics, 'answer': res.title,
                'sorted_verbs': sorted_lyrics, 'sorted_verb_counts': sorted_lyric_counts}
    return render_to_response("song_results.html", response)
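
Unlike the list-returning get_words in the test cases further down, this project's util.get_words evidently returns a word -> count mapping, since the view sorts words.iteritems() by count. A minimal counting variant might look like this (an assumption, not verbquest's actual code):

from collections import Counter

def get_words(text):
    # Map each whitespace-separated token to its number of occurrences;
    # the view above sorts this mapping by count.
    return Counter(text.lower().split())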
Code example #6
    def query():
        """Query for prototypical articles within a topic (using "search" url param).

        Returns:
            JSON listing of prototypical records for the given topic.
        """
        query_string = flask.request.args.get('search')
        keywords = util.get_words(query_string)
        report_maybe('query', query_string)
        records = records_keep.query(keywords)
        records_serial = list(
            sorted(map(model.serialize_record_to_dict, records),
                   key=lambda x: x['source']))
        return json.dumps({'records': records_serial})
Code example #7
File: main.py Project: raziakram/solutions
def get_unigram_model(item_sort, cat_count):
    reader = readfile(new_train_file)
    item_word = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    cat_word = defaultdict(lambda: defaultdict(int))
    for (__user, sku, category, raw_query, ___click_time) in reader:
        bound = cat_count[category][HOT_SIZE]
        popular = [i[0] for i in item_sort[category][0:bound]]
        if sku in popular:
            words = get_words(raw_query)
            for w in words:
                # magic_num is a count increment defined elsewhere in the
                # project.
                item_word[category][sku][w] += magic_num
                cat_word[category][w] += magic_num
    return item_word, cat_word
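
The triply nested defaultdict means a brand-new (category, sku, word) path can be incremented without any key checks. A quick standalone illustration with hypothetical keys:

from collections import defaultdict

item_word = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
item_word['cameras']['sku123']['canon'] += 1  # no KeyError on first touch
print(item_word['cameras']['sku123']['canon'])  # -> 1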
Code example #8
File: main.py Project: harixxy/solutions
def get_unigram_model(item_sort, cat_count):
    reader = readfile(new_train_file)
    item_word = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    cat_word = defaultdict(lambda: defaultdict(int))
    for (__user, sku, category, raw_query, ___click_time) in reader:
        bound = cat_count[category][HOT_SIZE]
        popular = [i[0] for i in item_sort[category][0:bound]]
        if sku in popular:
            words = get_words(raw_query)
            for w in words:
                # magic_num is a count increment defined elsewhere in the
                # project.
                item_word[category][sku][w] += magic_num
                cat_word[category][w] += magic_num
    return item_word, cat_word
Code example #9
File: views.py Project: markaurelius/verbquest
def api_song_search(request):
    query = request.path.split("/")[-1]
    try:
        res = song.search(combined=query, results=1,
                          buckets=['id:lyricfind-US'], limit=True)[0]
        lid = res.get_foreign_id('lyricfind-US').split(":")[-1]
        lyrics = util.get_lyrics(lid)
        words = util.clean_lyrics(lyrics)
        words = util.get_words(words)
        words = util.remove_common_words(words)
        # Ascending sort, then slice the tail: the 20 most frequent words.
        sorted_lyric_data = sorted(words.iteritems(), key=itemgetter(1), reverse=False)[-20:]
        sorted_lyric_counts = [x[1] for x in sorted_lyric_data]
        sorted_lyrics = [x[0] for x in sorted_lyric_data]
        sorted_lyric_combined = [[x[1], x[0]] for x in sorted_lyric_data]
        response = {'artist': res.artist_name, 'title': res.title,
                    'sorted_words': sorted_lyric_combined}
    except Exception as e:
        response = {'error': str(e)}
Code example #10
def eval_hc_test(hc, features, labels, print_err=False, decoder='hc'):
    """
    Evaluate on the test set (the "testa" files).
    :param hc: trained sequence model
    :param labels: label set
    :return: prints precision, recall, and F1
    """
    tp = 0
    fp = 0
    fn = 0

    dirname = "testa"
    lines = []
    for fname in os.listdir(dirname):
        if fname.endswith(".txt"):
            fh = open(os.path.join(dirname, fname))
            lines.extend(list(fh))
            fh.close()
    sentences = util.extract(lines, features, labels, keep_word=True)

    for sen in sentences:
        predicted = get_tag(hc, sen, features, decoder)
        (x, y, z) = eval_ner(util.get_lab(sen), predicted, labels)
        tp += x
        fp += y
        fn += z
        # Print any sentence with at least one error for inspection.
        if print_err and y + z > 0:
            print "sen: ", util.get_words(sen, features) + " OOV = " + str(has_oov(sen))
            print "true labels: ", util.get_lab_name(util.get_lab(sen), labels)
            print "predicted: ", util.get_lab_name(predicted, labels)

    try:
        pre = tp * 1.0 / (tp + fp)
        rec = tp * 1.0 / (tp + fn)
        f = 2.0 * pre * rec / (pre + rec)
        print pre, rec, f
    except ZeroDivisionError:
        print "DIV BY 0 ", tp, fp, fn
Code example #11
File: main.py Project: harixxy/solutions
def make_predictions(st_line, ed_line, out_file, pname, models):
    (cat_count, item_count, item_sort, alpha, beta, item_word,
     bigram_item_word, time_cat_item_dict, cat_word, hot_words) = models[0]
    reader = readfile(new_test_file)
    writer = writefile(out_file)
    line_idx = 0
    for (user, category, raw_query, click_time) in reader:
        line_idx += 1
        if line_idx < st_line:
            continue
        if line_idx > ed_line:
            break
        if line_idx % TEST_STEP == 0:
            print '%s--%d' % (pname, line_idx / TEST_STEP)
        time_block = get_time_feature(click_time)
        try:
            bound = cat_count[category][PREDICT_HOT_SIZE]
            hots = [x[0] for x in item_sort[category][0:bound]]
        except (KeyError, IndexError):  # category absent from the model
            writer.writerow(["0"])
            continue
        try:
            bigram = get_bigram_word(raw_query, hot_words, category)
            words = get_words(raw_query)
            query_size = sum([cat_word[category][w] for w in words])
            if query_size >= 100 and len(bigram) > 0:
                # Only queries that are hot enough and can generate bigram
                # features are scored by the boosting model.
                rank = [[sku, boosting_bayes(bigram, words, category, sku, alpha, beta, item_word, bigram_item_word, item_count, cat_count, time_cat_item_dict, time_block)] for sku in hots]
            elif query_size >= 100 and len(bigram) == 0:
                # Hot enough but no bigram features: fall back to naive
                # Bayes with time information.
                rank = [[sku, time_bayes_query_prediction(words, category, sku, alpha, beta, item_word, item_count, cat_count, time_cat_item_dict, time_block)] for sku in hots]
            else:
                # Otherwise use plain naive Bayes.
                rank = [[sku, plain_bayes_query_prediction(words, category, sku, alpha, beta, item_word, item_count, cat_count)] for sku in hots]
            rank = sorted(rank, key=lambda x: x[1], reverse=True)
            guesses = [i[0] for i in rank[0:5]]
            guesses = rerank_guess(guesses, user, raw_query)

            writer.writerow([" ".join(guesses)])
        except (TypeError, KeyError):  # a category we haven't seen before
            writer.writerow([" ".join(hots[0:5])])
Code example #12
File: data_processing.py Project: harixxy/solutions
def correct_query(raw_query, lemmatizer, local_cache):
    raw_query = raw_query.lower().strip()
    if raw_query in local_cache:
        return local_cache[raw_query]
    words = get_words(raw_query)
    new_words = []
    for w in words:
        split = get_split(w)
        # get_split returns a tuple when the token was split apart.
        if isinstance(split, tuple):
            new_words.extend(split)
        else:
            new_words.append(split)
    lemmas = []
    for w in new_words:
        lemma = lemmatizer.lemmatize(w)
        # When the lemma is a mixed token of length >= 4 (neither all
        # digits nor all letters, e.g. 'sd500'), try splitting the original
        # word into its word and number parts.
        if len(lemma) >= 4 and not lemma.isdigit() and not lemma.isalpha():
            split = split_word_num(w)
            if isinstance(split, tuple):
                w, num = split
                lemma = ' '.join([w, num])
        lemmas.append(lemma)
    new_query = ' '.join(lemmas)
    local_cache[raw_query] = new_query
    return new_query
Code example #13
        # Reserve two extra one-hot slots past the longest paragraph: one
        # for unknown answers ('unk') and one for the empty answer ('').
        answer_one_hot = [0 for i in range(largest_num_of_words_any_paragraph + 2)]
        answer_one_hot[largest_num_of_words_any_paragraph] = 1
        answer_lookup_dict['unk'] = answer_one_hot
        answer_one_hot = [0 for i in range(largest_num_of_words_any_paragraph + 2)]
        answer_one_hot[largest_num_of_words_any_paragraph + 1] = 1
        answer_lookup_dict[''] = answer_one_hot
        feed_dict = {question: questions_words, text: paragraphs_sentences}
        classification = sess.run(answer_softmax, feed_dict)
        print util.get_words(classification, answer_lookup_dict,
                             largest_num_of_words_in_answer)

    while True:
        proceed = raw_input(
            "Do you want to ask another question (q), enter new paragraph (p) or exit (e): "
        )
        if proceed not in ['p', 'q', 'e']:
            print "Invalid input"
        else:
            break
Code example #14
File: main.py Project: tkondrashov/thisminute
			LEFT JOIN tweet_properties ON id = tweet_id
			WHERE TO_TIMESTAMP(%s) <= time AND time < TO_TIMESTAMP(%s)
			ORDER BY time ASC
		""", (last_runtime, current_time))

    last_runtime = current_time

    if current_time - last_traintime >= 30:
        pericog.update()
        last_traintime = current_time

    ids = []
    X = []
    for id, timestamp, geolocation, exact, user, text in db_tweets_cursor.fetchall():
        # Skip tweets that tokenize to no words at all.
        if not get_words(text):
            continue

        ids.append(id)
        X.append(text)

        db_tweets_cursor.execute(
            """
            INSERT INTO tweet_votes
                (tweet_id, user_ip, disaster)
            VALUES
                (%s, '0.0.0.0', False)
            """, (id, ))

    if X:
        Y = pericog.predict(X)
Code example #15
File: main.py Project: harixxy/solutions
def get_bigram_word(raw_query, hot_words, cat):
    words = get_words(raw_query)
    # Keep only words that are "hot" for this category, then sort so each
    # unordered pair has a canonical representation.
    words = [w for w in words if w in hot_words[cat]]
    words.sort()
    bigram = get_pair(words)
    return bigram
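
get_pair is not shown in the listing. Since the words are sorted first, it plausibly enumerates unordered word pairs; a sketch under that assumption:

from itertools import combinations

def get_pair(words):
    # All unordered pairs of the (already sorted) word list.
    return list(combinations(words, 2))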
Code example #16
 def test_get_words(self):
     words = util.get_words('test sentence 1.', False)
     self.assertEquals(len(words), 3)
     self.assertEquals(words[0], 'test')
     self.assertEquals(words[1], 'sentence')
     self.assertEquals(words[2], '1')
Code example #17
 def test_get_words_special_chars(self):
     words = util.get_words('test sentence-1.', False)
     self.assertEquals(len(words), 2)
     self.assertEquals(words[0], 'test')
     self.assertEquals(words[1], 'sentence-1')
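
Together with the dedupe flag documented in Code example #4, these two tests pin down the tokenizer's observable behavior: lowercase word tokens, trailing punctuation dropped, hyphenated tokens kept whole. A minimal sketch consistent with the assertions above (an assumption, not the project's actual util.get_words):

import re

def get_words(text, dedupe=True):
    # Hyphens count as word characters, so 'sentence-1.' yields
    # 'sentence-1' while the trailing '.' is dropped.
    words = re.findall(r"[\w-]+", text.lower())
    return set(words) if dedupe else words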
Code example #18
File: main.py Project: raziakram/solutions
def get_bigram_word(raw_query, hot_words, cat):
    words = get_words(raw_query)
    words = [w for w in words if w in hot_words[cat]]
    words.sort()
    bigram = get_pair(words)
    return bigram
Code example #19
File: models.py Project: jbochi/to-a-pe
 def searchable_words(self):
     return get_words(self.search_text())