Example #1
def create_all_indi_word_clouds(infold, outfold, sub='tweets'):
    stopwords = utils.get_stopwords(sub)
    outfold = os.path.join(outfold, sub, 'indi')
    os.makedirs(outfold, exist_ok=True)
    for i, city in enumerate(config.CITIES + ['other']):
        print(city)
        create_indi_word_cloud(infold, outfold, sub, city, stopwords)
Example #2
def grid_search_dim_red(*args):
    # Dummy search.
    stop_words = get_stopwords()
    n_estimators = (30, 40, 50)
    max_depths = [5, 7, 10, 20]
    criterions = ('gini', 'entropy')
    pca_params = [7, 14, 21, 30]
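    # 4 max_depth values * 2 criteria * 3 n_estimators values * 4 PCA sizes = 96 configurations, printed as they are evaluated.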
    i = 1
    for max_depth in max_depths:
        for criterion in criterions:
            for n_estimator in n_estimators:
                for pca_param in pca_params:
                    print(
                        f'{i}. {max_depth}, {criterion}, {n_estimator}, {pca_param}.'
                    )
                    vectorizer = TfidfVectorizer(stop_words=stop_words,
                                                 max_df=.7)
                    clf = RandomForestClassifier(n_estimators=n_estimator,
                                                 criterion=criterion,
                                                 max_depth=max_depth)
                    print(
                        grid_search_with_dim_red_one(clf,
                                                     vectorizer,
                                                     *args,
                                                     n=pca_param))
                    i += 1
Example #3
def get_text_ranks():
    segmenter = Segmenter()
    stopwords = get_stopwords()
    print("Start TextRank over the selected quatrains ...")
    corpus = get_corpus()
    adjlist = dict()
    for idx, poem in enumerate(corpus):
        if 0 == (idx + 1) % 10000:
            print("[TextRank] Scanning %d/%d poems ..." %
                  (idx + 1, len(corpus)))
        for sentence in poem['sentence']:
            segs = list(
                filter(lambda word: word not in stopwords,
                       segmenter.segment(sentence)))
            for seg in segs:
                if seg not in adjlist:
                    adjlist[seg] = dict()

            for i, seg in enumerate(segs):
                for _, other in enumerate(segs[i + 1:]):
                    if seg != other:
                        adjlist[seg][other] = adjlist[seg][other] + 1 \
                            if other in adjlist[seg] else 1.0
                        adjlist[other][seg] = adjlist[other][seg] + 1 \
                            if seg in adjlist[other] else 1.0

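    # Normalize each word's outgoing co-occurrence weights so they sum to 1 (a row-stochastic graph for TextRank).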
    for word in adjlist:
        w_sum = sum(weight for other, weight in adjlist[word].items())
        for other in adjlist[word]:
            adjlist[word][other] /= w_sum
    print("[TextRank] Weighted graph has been built.")
    _text_rank(adjlist)
Example #4
def get_distribution_other(locfile):
    stopwords = utils.get_stopwords()
    freqs = {}
    with open(locfile, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 2:
                continue
            freqs[parts[0]] = int(parts[1])

    freqs = utils.filter_stopwords(freqs, stopwords, filter_unprintable=True)

    cloud = wordcloud.WordCloud(
        background_color="white").generate_from_frequencies(freqs)

    fig = plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.title('Locations of Other', fontsize=15)
    plt.show()
    plt.clf()
    plt.close()


# get_distribution_other('loc.vocab')
# rank_city_by_uniqueness('tweets', 'images/unique', thres1=5,thres2=5, at_least_ratio=2, results=2)
# rank_city_by_uniqueness('bios', 'images/unique', thres1=2,thres2=2, at_least_ratio=2, results=2)
# get_popuplarity_distribution('images/pop')
Example #5
def remove_stopwords(sub, d='processed'):
    nlp = spacy.load('en_core_web_sm')
    stopwords = utils.get_stopwords()

    infold = d + '/tokenized'
    outfold = d + '/cleaned'

    subinfold = os.path.join(infold, sub)
    suboutfold = os.path.join(outfold, sub)
    os.makedirs(suboutfold, exist_ok=True)

    files = glob.glob(subinfold + '/*.tok')
    for file in files:
        filename = file[file.rfind('/') + 1:]

        out = open(os.path.join(suboutfold, filename), 'w')
        with open(file, 'r') as f:
            for line in f:
                line = line.strip().replace("’", "'")
                line = line.replace("’", "'")
                line = line.replace("“", '"')
                line = line.replace('”', '"')
                raw_tokens = line.split()
                tokens = [
                    token for token in raw_tokens
                    if len(token) > 0 and not token.lower() in stopwords
                ]
                if len(tokens) > 0:
                    out.write(' '.join(tokens) + '\n')

        out.close()
Example #6
def get_wordvectors():
    print("wordvectors read ...")
    word_dict = utils.get_word_counts("/../resources/yelp/data/yelp_restaurant_word_counts.txt")
    stop_words = utils.get_stopwords()
    vw_model = utils.get_word2vec_model('../resources/yelp/word2vec/yelp_restaurants_word2vector', ncols, nwin)
    vw_model.syn0 = utils.normalize2(vw_model.syn0)
    glove_dict = utils.get_glove_data('../resources/yelp/glove/','vectors_'+str(ncols)+'.txt')
    return vw_model, word_dict, stop_words, glove_dict
Example #7
 def __init__(self, flags):
     # Load the model
     self.model = get_model(flags.max_len, flags.vocab_size,
                            flags.embedding_dim, flags.lstm_unit,
                            flags.dropout_loss_rate, flags.label_num)
     self.model.load_weights(flags.weight_save_path)
     # Preload resources for processing the review data (stopwords, vocab, labels)
     self.stopwords = get_stopwords(flags.stopwords_file)
     self.w2i, _ = read_vocab(flags.vocab_file)
     with open(flags.label_file, 'r') as f:
         self.labels = [l.strip() for l in f.readlines()]
     self.classify = ['Not mention', 'Bad', 'Normal', 'Good']
Example #8
    def __init__(self, **kargs):
        """Initialize, load Glove embeddings."""
        super(GloVeSemanticSimilarityMetric, self).__init__()

        get_nltk_data()
        self._glove = get_glove_emb()
        stopwords = get_stopwords()
        logger.info("Glove embeddings and stopwords loaded.")

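        # Zero out the embedding rows of stopwords so they contribute nothing to the similarity metric.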
        for word in stopwords:
            word = word.lower().strip()
            if word in self._glove["tok2id"]:
                self._glove["emb_table"][self._glove["tok2id"][word], :] = 0
Example #9
def search_synonym (query):
    try:
        solr = connect_solr()
        list_words = ViTokenizer.tokenize(query).split()
        stopwords = utils.get_stopwords()

        words = []  # words remaining after stop-word removal
        for word in list_words:
            if word not in stopwords:
                words.append(word)

    except Exception:
        print("[ERROR] search synonym error: Something went wrong!")
Example #10
def create_all_duo_word_clouds(infold, outfold, sub):
    stopwords = utils.get_stopwords(sub)
    outfold = os.path.join(outfold, sub, 'duo')
    os.makedirs(outfold, exist_ok=True)

    # for i, city1 in enumerate(config.CITIES + ['other']):
    # 	for j, city2 in enumerate(config.CITIES + ['other']):

    stopwords.add('favorite')
    stopwords.add('favourite')
    stopwords.add('mom')
    stopwords.add('mum')
    for i, city1 in enumerate(['nyc']):
        for j, city2 in enumerate(['melbourne']):
            if i <= j:
                create_duo_word_clouds(infold, outfold, sub, city1, city2,
                                       stopwords)
Example #11
def sentences2idx(texts, words):
    """
    Take in data and output an array of word indices that can be fed into the algorithms.
    :param texts: list of texts
    :return: x1, m1. x1[i, :] is the word indices in sentence i; m1[i, :] is the mask for sentence i (0 means no word at that location).
    """
    seq = []
    for t in texts:
        # Doing some cleaning of the text
        stopwords = utils.get_stopwords()
        text = t.strip().strip('"')
        text_clean = utils.clean_text(text)
        s = [w for w in text_clean.split(" ") if w not in stopwords]
        s = s[0:MAX_WORDS]
        seq.append(data_io.getSeq(' '.join(s), words))
    x1, m1 = data_io.prepare_data(seq)
    return x1, m1
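
A minimal, self-contained sketch of the index/mask arrays the docstring above describes; this is not the repo's data_io.prepare_data, and the indices below are made up:

import numpy as np

def pad_sequences(seqs):
    # Pad variable-length index lists into x (word indices) and m (mask; 0 means no word at that position).
    max_len = max(len(s) for s in seqs)
    x = np.zeros((len(seqs), max_len), dtype=np.int64)
    m = np.zeros((len(seqs), max_len), dtype=np.float32)
    for i, s in enumerate(seqs):
        x[i, :len(s)] = s
        m[i, :len(s)] = 1.0
    return x, m

x1, m1 = pad_sequences([[3, 7, 2], [5, 1]])
# x1 -> [[3, 7, 2], [5, 1, 0]]; m1 -> [[1., 1., 1.], [1., 1., 0.]]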
Example #12
def get_tf(text):
    stopdict = utils.get_stopwords()
    tf = {}
    # seg = utils.chinese_segment.get_segment(text)
    word_count = 1
    for word in text.split(' '):
        if word not in stopdict and len(word.strip()) > 1:
            word_count += 1
            tf[word] = tf.get(word, 0) + 1
    rs = sorted(iteritems(tf), key=lambda d: d[1], reverse=True)
    rs = [(a[0], a[1] / float(word_count)) for a in rs]
    return rs[:20]
Example #13
def search (query, page=1):
    try:
        solr = connect_solr()
        list_words = ViTokenizer.tokenize(query).split()
        stopwords = utils.get_stopwords()
        words = []  # words remaining after stop-word removal
        for word in list_words:
            if word not in stopwords:
                words.append(word)
        if len(words) == 0:
            return { "results": [], "numFound": 0 }
        else:            
            clean_query = ' '.join(words)
            page = int(page)
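            # Pagination: Solr returns 10 rows per request by default, so 'start' advances in steps of 10.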
            results = solr.search("content_clean:{}".format(clean_query), **{'fl': '*, score', 'start': "{}".format((page - 1)*10)})
        return { "results": results, "numFound": results.raw_response['response']['numFound']}
    except Exception:
        print("[ERROR] search error: Something went wrong!")
Example #14
def vanilla_experiment(dataset, classifiers, tf_idf_params, nlp=NLP):
    train_df = read_data(dataset['train'])
    dev_df = read_data(dataset['dev'])

    stop_words = list(get_stopwords())

    X_train, y_train, X_dev, y_dev = get_train_dev_data(train_df, dev_df, nlp)
    print(len(X_train), len(X_dev), flush=True)
    best_clfs = grid_search(classifiers,
                            tf_idf_params,
                            X_train,
                            y_train,
                            stop_words=stop_words)
    test_on_dev(best_clfs, X_train, y_train, X_dev, y_dev)
    conf_train, conf_dev, labels = get_confusion_matrix(
        best_clfs[0], X_train, y_train, X_dev, y_dev)
    pprint(conf_train)
    pprint(conf_dev)
    return best_clfs
Example #15
    def __init__(self, dataset, model_init):
        self._tokenizer = BertTokenizer.from_pretrained(
            utils.get_transformers(model_init), do_lower_case="uncased" in model_init)

        self._glove = get_glove_emb()
        stopwords = get_stopwords()

        for word in stopwords:
            word = word.lower().strip()
            if word in self._glove["tok2id"]:
                self._glove["emb_table"][self._glove["tok2id"][word], :] = 0

        data = []
        logger.info("processing data for wordpiece embedding training")
        for item in tqdm.tqdm(dataset["data"]):
            text = item["text0"]
            if "text1" in item:
                text += " " + item["text1"]

            text_toks = word_tokenize(text)
            data += [x for x in text_toks if x.lower() in self._glove["tok2id"]]
        self._data = data
Example #16
def processing_data(infile, labelfile, outfile, vocab_file, stopwords_file):
    print('Loading stopwords...')
    stopwords = get_stopwords(stopwords_file)

    print('Loading data...')
    data = pd.read_csv(infile)

    print('Saving labels')
    with open(labelfile, 'w') as f:
        for label in data.columns[2:]:
            f.write(label + '\n')

    # Split sentences into words
    print('Splitting content')
    contents = data['content'].tolist()
    seg_contents = segmentData(contents, stopwords)

    if not os.path.exists(vocab_file):
        print('Creating vocabulary...')
        create_vocab(seg_contents, vocab_file, 50000)

    print('Loading vocabulary...')
    w2i, _ = read_vocab(vocab_file)

    # word2id
    print('Tokenize...')
    token_contents = [tokenizer(c, w2i) for c in seg_contents]
    data['content'] = token_contents

    # Convert labels to one-hot form
    print('One-hot label')
    for col in data.columns[2:]:
        label = data[col].tolist()
        onehot_label = [onehot(l) for l in label]
        data[col] = onehot_label

    print('Saving...')
    data[data.columns[1:]].to_csv(outfile, index=False)
Example #17
def calculate(raw_data, tag):
    '''tag: which information source the raw news items come from'''
    """Fetch the raw news data"""
    if not raw_data == {}:
        entityId_list = []  # news IDs
        train_titles = []
        publishDateTime_list = []
        count = 0
        for k, v in raw_data.items():
            if v.get('content') is None or v.get('content') == '':
                continue
            count += 1
            entityId_list.append(k)

            if v.get('title') is None or v.get('title') == '':
                train_titles.append(v.get('content'))
            else:
                train_titles.append(v.get('content') + ' ' + v.get('title'))
            publishDateTime_list.append(v.get('publishDateTime'))

        print "get raw data done!"
        """Clustering analysis"""
        hotId2newsId = {}  # news IDs belonging to each hot topic
        hotId_to_be_write = {}
        start = datetime.now()
        train_titles = [
            chinese_segment.get_segment(title) for title in train_titles
        ]
        train_data = [model_train.get_tf(d) for d in train_titles]

        print 'seg take time %s' % (datetime.now() - start).seconds
        '''First clustering pass: group the raw data into a number of hot topics'''
        start = datetime.now()
        first_cluster_result = model_train.first_cluster(
            train_data, entityId_list, publishDateTime_list)
        print 'first_cluster_result take time %s' % (datetime.now() -
                                                     start).seconds
        old_hotId_words = utils.get_hotId_words(tag)

        hotId_newsId = []  # news IDs associated with each hot topic
        '''
        Compare the current hot topics against the existing ones: if a topic is not
        among the existing hot topics, insert it; otherwise skip it.
        '''
        start = datetime.now()
        for first_result in first_cluster_result:
            terms = model_train.get_tf_from_list(first_result)  # keywords under this hot topic
            hotId, hotIdflag = utils.get_hot_id(
                terms, old_hotId_words)  # returns a hot-topic ID plus a flag marking whether it is new
            # if hotId not in old_hotId_words:
            if hotIdflag:
                hotId_to_be_write[hotId] = terms  # a newly created hot topic
            for news in first_result:
                # record = [hotId, news[0], news[2], utils.generateTime(), tag]  # news[0], news[2] are the news ID and creation time
                record = [hotId, news[0], news[2],
                          utils.generateTime()]  # news[0], news[2] are the news ID and creation time
                hotId_newsId.append(record)
                if hotId in hotId2newsId:
                    hotId2newsId[hotId].append(news[0])  # the hot topic and its associated news items
                else:
                    hotId2newsId[hotId] = [news[0]]
        utils.write2hotId_newsId(hotId_newsId)
        print "write to newsId done!"

        print 'hot compare take time %s' % (datetime.now() - start).seconds
        """Write the new hot topics"""
        for hotId, terms in hotId_to_be_write.items():
            if hotId in hotId2newsId:
                utils.write2hotId_words(hotId, terms, tag)
        print "write to words done!>>>>>>>>>"
        """Extract the topic keywords of the news under each hot topic"""
        # Compute IDF over all the news items collected in this batch
        stopdict = utils.get_stopwords()
        hotId2title = {}
        df_dict = {}
        n_doc = 0
        for line in train_titles:
            arr = line.split(' ')
            n_doc += 1
            temp_dict = {}
            # Deduplicate, since we are computing Document Frequency (DF)
            for tk in arr:
                # if token2id.__contains__(tk):
                temp_dict[tk] = 1
            for k, _ in temp_dict.items():
                if df_dict.__contains__(k):
                    df_dict[k] += 1
                else:
                    df_dict[k] = 1
        idf_dict = {}
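        # Smoothed inverse document frequency: idf(w) = log(n_doc / (df(w) + 1))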
        for w, df in iteritems(df_dict):
            idf_dict[w] = math.log(n_doc / float(df + 1))

        # Compute TF-IDF within each hot topic to find its keywords
        for hotId, newsId_list in hotId2newsId.items():
            tf = {}
            title_words = []
            for newsId in newsId_list:
                content = raw_data[newsId].get('content')
                seg = chinese_segment.get_segment(content)
                for word in seg.split(' '):
                    if not stopdict.__contains__(
                            word) and word.strip().__len__() > 1:
                        if tf.__contains__(word):
                            tf[word] += 1
                        else:
                            tf[word] = 1
            tf_idf = {}
            for word, freq in tf.items():
                if not idf_dict.__contains__(word):
                    tf_idf[word] = 0
                else:
                    tf_idf[word] = freq * idf_dict[word]
            rs = sorted(iteritems(tf_idf), key=lambda d: d[1], reverse=True)
            if len(rs) >= 3:
                title_words = [rs[0][0], rs[1][0], rs[2][0]]
            else:
                for r in rs:
                    title_words.append(r[0])
            hotId2title[hotId] = title_words
        """Run statistics over each hot topic's news data"""
        """order, hotScore, category, totalComments, totalLikes, siteCoverage, area, totalNews"""
        hot_information_data = [
        ]  # [{"hotId": 1, "order": 1, "title": "apple"},{...}]
        update = []
        insert_count = 0
        for hotId, newsId_list in hotId2newsId.items():
            # firstFindSite, lastDays
            keywords = hotId2title[hotId]
            news_count = len(newsId_list)
            totalComments = 0
            totalLikes = 0

            category_dict = {}
            site_set = set()
            local_area_count = 0
            earliestFindTime = "2049"
            firstFindSite = ""

            max_title = ""  # title of the highest-scoring news item
            max_score = -1  # highest news score
            second_title = ""  # title of the second-highest-scoring news item
            second_score = -1  # second-highest news score
            for newsId in newsId_list:
                news_category = raw_data[newsId].get('categoryName')
                news_comments = raw_data[newsId].get('commentCount')
                news_likes = raw_data[newsId].get('joinCount')
                news_site = raw_data[newsId].get('siteName')
                news_source_site = raw_data[newsId].get('sourceSiteName')
                news_publishDateTime = raw_data[newsId].get('publishDateTime')
                news_area = raw_data[newsId].get('siteTypeName')
                news_title = raw_data[newsId].get('title')

                if category_dict.__contains__(news_category):
                    category_dict[news_category] += 1
                else:
                    category_dict[news_category] = 1
                if news_area == u"本地":  # "本地" means "local"
                    local_area_count += 1
                # Find the earliest publish time
                if news_publishDateTime < earliestFindTime:
                    earliestFindTime = news_publishDateTime
                    if news_source_site is None or news_source_site == '':
                        firstFindSite = news_site
                    else:
                        firstFindSite = news_source_site
                temp_score = 0
                if news_comments is not None and news_comments.isdigit():
                    totalComments += int(news_comments)
                    temp_score += int(news_comments) * 0.6
                if news_likes is not None and news_likes.isdigit():
                    totalLikes += int(news_likes)
                    temp_score += int(news_likes) * 0.4
                # Find the titles of the two highest-scoring news items
                if temp_score > max_score:
                    second_score = max_score
                    second_title = max_title
                    max_score = temp_score
                    max_title = news_title
                else:
                    if temp_score > second_score:
                        second_score = temp_score
                        second_title = news_title
                site_set.add(news_site)
            title = [max_title, second_title]
            rs = sorted(iteritems(category_dict),
                        key=lambda d: d[1],
                        reverse=True)  # sort categories by count
            category = rs[0][0]  # take the category with the most news items
            all_site_number = 6.  # total number of sites
            siteCoverage = round(len(site_set) / all_site_number, 2)
            if (float(local_area_count) / news_count) > 0.5:
                area = "local"
            else:
                area = "global"
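            # Heat score: a weighted mix of comments, likes, and coverage-scaled news count; Weibo topics get a much larger multiplier.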
            if tag == 'weibo':
                hotScore = 80 * (totalComments + totalLikes +
                                 news_count * siteCoverage)
            else:
                hotScore = 0.8 * totalComments + 1.0 * totalLikes + news_count * siteCoverage

            if tag == 'weibo':
                firstFindSite = '新浪微博'  # Sina Weibo
            # elif tag == 'weixin':
            #     firstFindSite = '微信'

            hotScore = round(math.log(hotScore + 1), 2)

            if hotScore > 30:
                hotScore = 20
            elif hotScore < 30 and hotScore > 20:
                hotScore = 19.8

            # Count the news items already recorded for this existing hot topic
            news_count_old = news_count
            if hotId in old_hotId_words.keys():
                news_count_old = utils.get_newsId_count(hotId)
                print 'old total news:%s' % news_count_old

            hot_data_wrap = {
                "hotId": hotId,
                "hotScore": hotScore,
                "title": title,
                "category": category,
                "totalComments": totalComments,
                "totalLikes": totalLikes,
                "siteCoverage": siteCoverage,
                "area": area,
                "keywords": keywords,
                "totalNews": news_count_old,
                "firstFindSite": firstFindSite,
                "datatagcategory": tag,
                "lastDays": 1
            }
            # Hot-topic discovery time
            if hotId in old_hotId_words:
                update.append(1)
            else:
                hot_data_wrap.setdefault("findTime", utils.generateTime())
                update.append(0)
                insert_count += 1

            hot_data_wrap.setdefault("lastUpdate", utils.generateTime())
            hot_information_data.append(hot_data_wrap)
        print "insert_count/operation_count:", insert_count, '/', len(hot_information_data)
        utils.write2hot_info(hot_information_data, update)
        print "write to hot_info done!>>>>>>>>>"
        """Compute order and hotScore (full score: 100)"""
        hotScore_data = utils.get_hotScore_from_hot_info_time_decay(
            'datatagcategory', tag)  # fetch hotId and hotScore
        hotScore_data = sorted(hotScore_data,
                               key=lambda d: d["hotScore"],
                               reverse=True)  # sort by hotScore in descending order
        # max_hotScore = hotScore_data[0].get('hotScore')  # take the maximum
        # print max_hotScore
        for i in range(len(hotScore_data)):
            hotScore_data[i].setdefault("order", i + 1)
            # hotScore_data[i].__setitem__("hotScore", int(100 * hotScore_data[i].get("hotScore") / float(max_hotScore)))
        utils.update_hot_info(hotScore_data)
        print "update rank to hot_info done!>>>>>>>>>"
    else:
        print "empty raw data"
Example #18
        # tmp holds every n-gram combination generated from this line
        # 雪落/ 山庄/ 不是/ 一座/ 山庄/ 只是/ 一个/ 客栈
        # tmp: [['雪落'], ['山庄'], ['不是'], ['一座'],['山庄'],['只是'],['一个'],['客栈'],
        # ['雪落', '山庄'], ['山庄', '不是'], ['不是', '一座'],['一座','山庄'],['山庄','只是'],['只是','一个'],['一个','客栈']
        #  ['雪落', '山庄', '不是'], ['山庄', '不是', '一座'],[不是'','一座','山庄'],...
        ngrams = generate_ngram(word_list, 3)
        #print(ngrams)
        for d in ngrams:
            root.add(d)
    print('------> 插入成功')  # "insertion finished"


if __name__ == "__main__":

    root = TrieNode('*', None)
    stopwords = get_stopwords('./data/stopword.txt')
    data = load_data('./data/data.txt', stopwords)
    # Insert the new articles under the trie root
    load_data_2_root(data)

    # How many top candidates to take
    topN = 20
    result, add_word = root.find_word(topN)
    # To debug or choose a different threshold, print result and adjust accordingly
    # print("\n----\n", result)
    print("\n----\n", '增加了 %d 个新词, 词语和得分分别为: \n' % len(add_word))  # "added %d new words; the words and their scores are:"
    print('#############################')
    for word, score in add_word.items():
        print(word + ' ---->  ', score)
    print('#############################')
Example #19
def main(distance_type, refresh):
    '''
    INPUT
        distance_type<string>: 'Gaussian' for Gaussian projection OR
            'Sparse' for Achlioptas projection (default: 'Sparse')

        refresh<bool>: refresh the data model, recomputing word2idx and bow

    OUTPUT
        dist<float<D,D>>: matrix of distances using the random projections algorithm;
            values are always saved following the pattern <projection>_<eps>_distance_matrix.txt
            D: original count of documents
    '''

    filename_distance_matrix = '%s_distance_matrix.txt' % (distance_type)

    if refresh:
        devel_path = DATASET_PATH + 'development.json'
        print('reading...\r')
        data = pd.read_json(devel_path, orient='records')
        print('reading...done\r')

        print('')
        print('tokenizing...\r')
        this_stemmer = get_stemmer()
        this_stopwords = get_stopwords()
        tokenfy = lambda x: tokenizer2(
            x, stemmer=this_stemmer, stopwords=this_stopwords)

        data['token_description'] = data['description'].apply(tokenfy)
        print('tokenizing...done\r')

        print('')
        print('indexing...\r')
        word2idx = {}
        data = data2idx(data,
                        word2idx,
                        colname='token_description',
                        new_colname='idx_description')
        print('indexing...done\r')

        print('')
        print('generating bag of words...\r')
        print('')
        bow2 = data2bow(data, word2idx)
        print('generating bag of words...done\r')

    else:
        print('')
        print('retrieving word2idx...')
        word2idx_path = DATASET_PATH + 'word2idx2.txt'
        df = pd.read_csv(word2idx_path, sep=' ', index_col=0, header=None)
        word2idx = {k: v for k, v in zip(df.index, df.iloc[:, 0])}
        print('retrieving word2idx...done')

        print('retrieving bow2...')
        bow2_path = DATASET_PATH + 'bow2.txt'
        df = pd.read_csv(bow2_path,
                         sep=' ',
                         index_col=None,
                         header=None,
                         skiprows=1)
        bow2 = df.as_matrix()
        print('retrieving bow2...done')

    print('')
    print('compute %s distance...\r' % (distance_type))
    dist = bow2dist(bow2, verbose=True, distance_type=distance_type)
    print('compute %s distance...done\r' % (distance_type))

    print('')
    print('storing %s distance matrix...\r' % (distance_type))

    matrix2txt(dist, filename=filename_distance_matrix)
    print('storing %s distance matrix...done\r' % (distance_type))
Example #20
from utils import get_stopwords

STOPWORDS = get_stopwords()
COLUMNS = [
    "utc_time", "country_name", "country_code", "place_type", "place_name",
    "language", "username", "user_screen_name", "timezone_offset",
    "number_of_friends", "tweet_text", "latitude", "longitude"
]
Example #21
def main(projection_type, eps, store, refresh):
    '''
    INPUT
        projection_type<string>: 'Gaussian' for Gaussian projection OR
            'Sparse' for Achlioptas projection (default: 'Sparse')

        eps<float>: threshold for acceptable distortion;
            higher eps -> higher theoretical probability of distortion;
            bounded between 0 and 1

        refresh<bool>: refresh the data model, recomputing word2idx and bow

        store<bool>: store 3 intermediary results: word2idx, bow, proj_bow

    OUTPUT
        dist<float<D,D>>: matrix of distances using the random projections algorithm;
            values are always saved following the pattern <projection>_<eps>_distance_matrix.txt
            D: original count of documents

        proj_bow<int<v,D>>: projection of the bag-of-words using the random projections algorithm;
            values are always saved following the pattern <projection>_<eps>_bow.txt
            v<int>: v << V is the new vocabulary size
            D<int>: original count of documents
    '''

    startTime = datetime.now()
    dt = startTime.strftime('%Y-%m-%d %H:%M:%S')
    filename_distance_matrix = '%s_%.1f_%s_distance_matrix.txt' % (
        projection_type, eps, dt)
    filename_projection_bow = '%s_%.1f_%s_bow.txt' % (projection_type, eps, dt)

    profiler_breakdown = dict([
        ('io', timedelta(0)), ('tokenizer', timedelta(0)),
        ('indexing', timedelta(0)), ('BoW', timedelta(0)),
        ('rnd_proj', timedelta(0)), ('BoW2Dist', timedelta(0)),
        ('total', timedelta(0))
    ])
    profiler_modelparams = {}

    breakdownTime = startTime
    if refresh:
        devel_path = DATASET_PATH + 'development.json'
        print('reading...\r')
        data = pd.read_json(devel_path, orient='records')
        print('reading...done\r')
        profiler_breakdown['io'] += datetime.now() - breakdownTime

        breakdownTime = datetime.now()
        print('')
        print('tokenizing...\r')
        this_stemmer = get_stemmer()
        this_stopwords = get_stopwords()
        tokenfy = lambda x: tokenizer2(
            x, stemmer=this_stemmer, stopwords=this_stopwords)

        data['token_description'] = data['description'].apply(tokenfy)
        print('tokenizing...done\r')
        profiler_breakdown['tokenizer'] += datetime.now() - breakdownTime

        breakdownTime = datetime.now()
        print('')
        print('indexing...\r')
        word2idx = {}
        data = data2idx(data,
                        word2idx,
                        colname='token_description',
                        new_colname='idx_description')
        print('indexing...done\r')
        profiler_breakdown['indexing'] += datetime.now() - breakdownTime

        breakdownTime = datetime.now()
        print('')
        print('generating bag of words...\r')
        print('')
        bow2 = data2bow(data, word2idx)
        print('generating bag of words...done\r')
        profiler_breakdown['BoW'] += datetime.now() - breakdownTime

        if store:
            breakdownTime = datetime.now()
            print('')
            print('storing indexes...\r')
            word2idx2txt(word2idx, filename='word2idx2.txt')
            print('storing indexes...done\r')

            print('')
            print('storing bag of words...')
            matrix2txt(bow2, filename='bow2.txt')
            print('storing bag of words...done\r')

            profiler_breakdown['io'] += datetime.now() - breakdownTime
    else:
        breakdownTime = datetime.now()
        print('')
        print('retrieving word2idx...')
        word2idx_path = DATASET_PATH + 'word2idx2.txt'
        df = pd.read_csv(word2idx_path, sep=' ', index_col=0, header=None)
        word2idx = {k: v for k, v in zip(df.index, df.iloc[:, 0])}
        print('retrieving word2idx...done')

        print('retrieving bow2...')
        bow2_path = DATASET_PATH + 'bow2.txt'
        df = pd.read_csv(bow2_path,
                         sep=' ',
                         index_col=None,
                         header=None,
                         skiprows=1)
        bow2 = df.as_matrix()
        print('retrieving bow2...done')
        profiler_breakdown['io'] += datetime.now() - breakdownTime

    breakdownTime = datetime.now()
    print('')
    print('compute random projection...\r')
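    # Per the docstring, eps bounds the acceptable distortion; a lower eps typically requires a higher projected dimension.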
    proj = bow2rnd_proj(bow2, projection_type=projection_type, eps=eps)
    print(
        'compute random projection... done new (reduced) dimensions:%dx%d\r' %
        proj.shape)
    profiler_breakdown['rnd_proj'] += datetime.now() - breakdownTime

    if store:
        breakdownTime = datetime.now()
        print('')
        print('storing bag of words...\r')
        matrix2txt(proj, filename=filename_projection_bow)
        print('storing bag of words...done\r')
        profiler_breakdown['io'] += datetime.now() - breakdownTime

    breakdownTime = datetime.now()
    print('')
    print('compute %s distance...\r' % (projection_type))
    proj_dist = bow2dist(proj)

    print('compute %s distance...done\r' % (projection_type))
    profiler_breakdown['BoW2Dist'] += datetime.now() - breakdownTime

    breakdownTime = datetime.now()
    print('')
    print('storing %s distance matrix...\r' % (projection_type))
    dt = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    matrix2txt(proj_dist, filename=filename_distance_matrix)
    print('storing %s distance matrix...done\r' % (projection_type))
    profiler_breakdown['io'] += datetime.now() - breakdownTime
    V = proj.shape[0]

    profiler_breakdown['total'] = sum(profiler_breakdown.values(), timedelta())
    profiler_breakdown = {k: str(td) for k, td in profiler_breakdown.items()}
    filename_profiler = '%s_%.1f_%d_profiler_%s.txt' % (projection_type, eps,
                                                        V, dt)
    dict2txt(profiler_breakdown, filename=filename_profiler)
Example #22
    print('------> 插入节点')  # "inserting nodes"
    # Build the n-gram combinations for every sentence line
    for word_list in data:
        # tmp holds every n-gram combination generated from the line
        # tmp: [['它'], ['是'], ['小'], ['狗'], ['它', '是'], ['是', '小'], ['小', '狗'], ['它', '是', '小'], ['是', '小', '狗']]
        ngrams = generate_ngram(word_list, 3)
        # Build the trie that stores these terms and their occurrence counts
        for d in ngrams:
            root.add(d)
    print('------> 插入成功')  # "insertion finished"


if __name__ == "__main__":
    #root_name = basedir + "/data/root.pkl"
    root_name = basedir + "/data/jianzhu.pkl"
    stopwords = get_stopwords()
    if os.path.exists(root_name):
        root = load_model(root_name)
    else:
        # The document alone does not reflect individual word frequencies well, so Jieba's bundled external dictionary is used
        dict_name = basedir + '/data/dict.txt'
        # Read the dictionary file and build a {word: frequency} dict from entries with frequency > 2
        word_freq = load_dictionary(dict_name)
        # Build the vocabulary trie
        root = TrieNode('*', word_freq)
        save_model(root, root_name)

    # Load the new articles
    #filename = 'data/demo.txt'
    filename = 'data/jianzhu.txt'
    # data is a 2-D list: [[tokens of line 1], [tokens of line 2], ...]
Example #23
def edits1(word):
    # All candidate strings one edit away from `word`; ALPHABET is defined earlier in the original source file.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
    replaces = [a + c + b[1:] for a, b in splits for c in ALPHABET if b]
    inserts = [a + c + b for a, b in splits for c in ALPHABET]
    return set(deletes + transposes + replaces + inserts)


def known_edits2(word, forms):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in forms)


if __name__ == "__main__":
    print("Please wait, reading data files...")

    stopwords = get_stopwords()
    forms = get_corrections(FORMS_FILE, stopwords, FORMS_ENCODING)
    corpus_statistics = corpus_stats(CORPUS, len(forms), stopwords)
    error_statistics = error_stats(ERROR_FILE)

    print("Please enter words to correct them.")

    while True:
        try:
            word = input("> ")
            time1 = time()
            # noinspection PyTypeChecker
            corrections = correct_word(
                word, corpus_statistics, error_statistics,
                compute_possible_corrections(word, forms) if ENHANCED_VERSION else forms
            )