Example No. 1
    manager = Manager()
    finish = Value("i", 0)   # shared flag the workers use to report completion
    res = manager.list()     # shared list for collecting results across processes

    args = parser.parse_args()
    keyword_path = args.keyword
    start_str = args.start.split("-")
    end_str = args.end.split("-")
    start = date(int(start_str[0]), int(start_str[1]), int(start_str[2]))
    end = date(int(end_str[0]), int(end_str[1]), int(end_str[2]))

    # Optional keyword file: one keyword per line.
    keywords = []
    if keyword_path is not None:
        with open(keyword_path, "r", encoding="utf-8-sig") as keywds:
            for word in keywds:
                keywords.append(word.rstrip("\n"))

    now = start
    dbutil = DBUtility()

    # Enqueue one date string per day in the inclusive [start, end] range.
    while now <= end:
        dt_queue.put(str(now))
        now += timedelta(days=1)

    # Spawn the download workers; daemon processes exit with the main process.
    for _ in range(WORKERS):
        t = Process(target=Download, args=(dt_queue, writelock, dbutil, finish, res, keywords))
        t.daemon = True
        t.start()

    total = dt_queue.qsize()
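
The excerpt relies on several names defined earlier in the source file and not shown in the listing: `parser`, `dt_queue`, `writelock`, `WORKERS`, and the `Download` worker. A minimal sketch of what that module-level setup could look like, assuming an argparse CLI and a multiprocessing queue; the argument names and the worker count are assumptions, not code from the original:

import argparse
from datetime import date, timedelta
from multiprocessing import Lock, Manager, Process, Queue, Value

from utility.DBUtility import DBUtility

WORKERS = 4  # assumed worker count

# Assumed CLI matching the attribute names used in the excerpt.
parser = argparse.ArgumentParser(description="Download articles for a date range")
parser.add_argument("--keyword", default=None, help="path to a keyword file, one word per line")
parser.add_argument("--start", required=True, help="start date, YYYY-MM-DD")
parser.add_argument("--end", required=True, help="end date, YYYY-MM-DD")

dt_queue = Queue()   # dates waiting to be processed by the Download workers
writelock = Lock()   # serializes shared writes inside the workers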
Example No. 2
             transform=plt.gcf().transFigure)

    plt.subplots_adjust(left=0.3)

    plt.savefig(os.path.join(base_path, export_dir, filename + ".png"))


if __name__ == '__main__':
    lock = Lock()
    writelock = Lock()
    manager = Manager()

    # Shared counters: `num` tracks progress, `finish` signals completion.
    num = Value("i", 0)
    finish = Value("i", 0)

    dbutils = DBUtility()

    # Make sure the export directory exists before the workers start writing plots.
    if not os.path.exists(os.path.join(base_path, export_dir)):
        os.mkdir(os.path.join(base_path, export_dir))

    days = (end - start).days

    contents = []

    # Single-line progress indicator, rewritten in place via '\r'.
    sys.stdout.write('\n')
    sys.stdout.write('\r')
    sys.stdout.write("Calculating Similarity... 0%")
    sys.stdout.flush()

    # Spawn the similarity workers.
    for _ in range(WORKERS):
        t = Process(target=Calculate_Sim,
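
The listing breaks off in the middle of the Process(...) call, so the worker's argument list is not visible. Presumably Calculate_Sim receives the shared objects created above, in the same pattern as the Download workers of Example No. 1. The completed loop and the polling wait below are a hypothetical sketch, not code from the original:

    # Hypothetical argument list; the real one is not shown in the excerpt.
    for _ in range(WORKERS):
        t = Process(target=Calculate_Sim,
                    args=(lock, writelock, dbutils, num, finish, contents))
        t.daemon = True
        t.start()

    # One common pattern: poll the shared counters and redraw the progress line
    # until every worker has reported completion (requires `import time`).
    while finish.value < WORKERS:
        pct = int(num.value / max(days, 1) * 100)
        sys.stdout.write("\rCalculating Similarity... {}%".format(pct))
        sys.stdout.flush()
        time.sleep(0.5)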
Example No. 3
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from multiprocessing import cpu_count
import pickle
import csv
from utility.DBUtility import DBUtility

dbutils = DBUtility()

# Stop-word list, one word per line.
stpwrdpath = "stop_word_all.txt"
STOPWORDS = []

# Australia-focused accounts whose articles we want to single out.
aus_accounts = [
    "华人瞰世界", "今日悉尼", "微悉尼", "澳洲微报", "悉尼印象", "Australia News", "澳洲中文台"
]
aus_articles = []

with open(stpwrdpath, 'r', encoding="utf-8-sig") as stopwords:
    for word in stopwords:
        STOPWORDS.append(word.rstrip("\n"))

# Pre-computed word segmentations produced by an earlier processing step.
with open("./segmentation.pickle", "rb") as f:
    segementations = pickle.load(f)

articles = dbutils.GetArticles({})
for article in articles:
    # Collect the IDs of articles published by the Australia-focused accounts.
    if article["account"] in aus_accounts:
        aus_articles.append(article["_id"])

sn_list = []
contents = []
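
The excerpt ends before the vectorizer and topic-model imports at the top are used. A minimal sketch of how the pre-segmented texts are typically fed into CountVectorizer and LatentDirichletAllocation, assuming each entry of `contents` is later filled with a whitespace-joined string of tokens; the topic count, document-frequency cutoffs, and iteration budget are assumptions, not values from the original:

# Assumes `contents` holds one whitespace-joined, pre-segmented document per
# article (e.g. built from `segementations`).
vectorizer = CountVectorizer(stop_words=STOPWORDS, max_df=0.95, min_df=2)
doc_term = vectorizer.fit_transform(contents)

# Fit an LDA topic model over the document-term counts.
lda = LatentDirichletAllocation(n_components=10,
                                max_iter=50,
                                learning_method="batch",
                                n_jobs=cpu_count())
doc_topics = lda.fit_transform(doc_term)

# Print the top words of each topic as a quick sanity check
# (older scikit-learn versions use get_feature_names() instead).
terms = vectorizer.get_feature_names_out()
for idx, topic in enumerate(lda.components_):
    top_words = [terms[i] for i in topic.argsort()[-10:][::-1]]
    print("Topic {}: {}".format(idx, " ".join(top_words)))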