Example #1
def stock_price_ambiguity(max_workers):
    cur = connect(db_path).cursor()
    cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
    stock_list = [row[0] for row in cur]
    cur.close()

    result = File_manager('analyzed', 'ambiguity')
    t = int(result.ver['ambiguity']) * 1000000

    try:
        result_df = pd.read_csv(result.path, index_col=0)
    except EmptyDataError:
        result_df = None

    with Pool(processes=int(max_workers), initializer=init) as p:
        stock_ambiguity = parmap.map(worker, stock_list, t,
                                     pm_pool=p, pm_pbar=True)
    stock_ambiguity = pd.concat(stock_ambiguity, axis=1)

    if result_df is not None:
        # drop the last saved row; the new run recomputes from that timestamp
        stock_ambiguity = pd.concat([result_df.iloc[:-1], stock_ambiguity])

    stock_ambiguity.sort_index(axis=0, inplace=True)

    result.update_version({'ambiguity': stock_ambiguity.index[-1]})
    stock_ambiguity.to_csv(result.path)
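
The snippet assumes module-level setup that the excerpt does not show. A minimal sketch of the imports and globals it relies on; db_path, init, and worker are placeholders for the project's own definitions, and File_manager is the project's versioned-file helper (imported elsewhere):

from multiprocessing import Pool
from sqlite3 import connect

import pandas as pd
import parmap
from pandas.errors import EmptyDataError

db_path = 'stocks.db'  # placeholder: SQLite file holding one table per ticker

def init():
    """Placeholder per-process initializer: open shared resources here."""

def worker(stock, t):
    """Placeholder: return a one-column DataFrame of ambiguity values for
    `stock`, computed from data newer than timestamp `t`."""
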
Example #2
def disaster_message_tf_idf(max_workers):
    input = File_manager('preprocessed', 'disasterMessage')
    tf_idf = File_manager('analyzed', 'disasterMessageTFIDF')
    idf = File_manager('analyzed', 'disasterMessageIDF')
    cmpr = idf.compare_version(input.ver)
    new_ver = input.ver.copy()

    if new_ver['disasterMessage'] == '0':
        return
    if len(cmpr) == 1 and idf.ver[cmpr[0]] == new_ver['disasterMessage']:
        return

    new_ver['disasterMessageTFIDF'] = new_ver['disasterMessage']
    del new_ver['disasterMessage']
    tf_idf.update_version(new_ver)
    new_ver['disasterMessageIDF'] = new_ver['disasterMessageTFIDF']
    del new_ver['disasterMessageTFIDF']
    idf.update_version(new_ver)

    preprocessed_data = read_csv(input.path)
    docs = preprocessed_data['tokens']
    tfidfv = TfidfVectorizer(
        # documents are '┃'-joined token strings; match each run between separators
        lowercase=False, token_pattern=r'(?u)[^┃]+?(?=┃|$)'
        ).fit(docs)
    vocabs = sorted(tfidfv.vocabulary_, key=tfidfv.vocabulary_.get)
    tf_idf_data = DataFrame(tfidfv.transform(docs).toarray(), columns=vocabs)
    idf_data = Series(tfidfv.idf_, index=vocabs).sort_values()

    tf_idf_data.to_csv(tf_idf.path, index=False)
    idf_data.to_csv(idf.path, header=False)
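
The custom token_pattern keeps the stored Komoran tokens intact: each document is a '┃'-joined token string, and the pattern matches every run of non-separator characters. A quick check of that behavior:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['호우┃경보┃발령', '호우┃주의보']
v = TfidfVectorizer(lowercase=False,
                    token_pattern=r'(?u)[^┃]+?(?=┃|$)').fit(docs)
print(sorted(v.vocabulary_))  # ['경보', '발령', '주의보', '호우']
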
Example #3
def disaster_message_crawler(max_workers):
    total = None
    minimum = 10  # minimum number of new messages per worker process
    message_list = []
    output = File_manager('raw', 'disasterMessage')
    n = int(output.parse_version()['disasterMessage'])

    options = webdriver.ChromeOptions()
    prefs = {'profile.default_content_setting_values': {
        'cookies': 2, 'images': 2, 'plugins': 2, 'popups': 2, 'geolocation': 2,
        'notifications': 2, 'auto_select_certificate': 2, 'fullscreen': 2,
        'mouselock': 2, 'mixed_script': 2, 'media_stream': 2,
        'media_stream_mic': 2, 'media_stream_camera': 2,
        'protocol_handlers': 2, 'ppapi_broker': 2, 'automatic_downloads': 2,
        'midi_sysex': 2, 'push_messaging': 2, 'ssl_cert_decisions': 2,
        'metro_switch_to_desktop': 2, 'protected_media_identifier': 2,
        'app_banner': 2, 'site_engagement': 2, 'durable_storage': 2
        }}
    options.add_experimental_option('prefs', prefs)
    options.add_argument('disable-infobars')
    options.add_argument('--disable-extensions')
    options.add_argument('--headless')

    with webdriver.Chrome(options=options) as driver:
        wait = WebDriverWait(driver, 5)
        init_page(driver, wait)

        total = int(driver.find_element_by_id('totCnt').text)

        if total == n:
            print('Collected 0 new items.')
            return

        num_workers = min(max((total - n) // minimum, 1), max_workers)
        split = get_split(total, n, num_workers)

        with Manager() as manager:
            lst = manager.list()
            # the main process crawls split[0]; workers handle the remaining ranges
            workers = [Process(target=worker, args=(split[i], options, lst))
                       for i in range(1, num_workers)]

            for wrkr in workers:
                wrkr.start()

            message_list.append(crawl_messages(driver, wait, split[0]))

            for wrkr in workers:
                wrkr.join()

            message_list.extend(lst)

    params = {
        'mode': 'a' if n else 'w',  # append when resuming a previous crawl
        'header': not n,
        'index': False
        }
    concat(message_list).sort_values(by='time').to_csv(output.path, **params)
    output.update_version({'disasterMessage': str(total)})
    print(f'Collected {total - n} new items.')
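
get_split, init_page, and worker are defined elsewhere in the module. Judging by how the result is consumed, get_split presumably divides the unread ids n+1..total into one contiguous range per worker; a hypothetical sketch, not the project's actual implementation:

def get_split(total, n, num_workers):
    """Hypothetical: split ids n+1..total into num_workers (start, stop)
    pairs suitable for range(start, stop)."""
    step = -(-(total - n) // num_workers)  # ceiling division
    return [(s, min(s + step, total + 1))
            for s in range(n + 1, total + 1, step)]
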
Example #4
def disaster_message_preprocessor(max_workers):
    mode = 'w'
    input = File_manager('raw', 'disasterMessage')
    output = File_manager('preprocessed', 'disasterMessage')
    new_ver = input.ver.copy()

    if new_ver['disasterMessage'] == '0':
        return

    raw = read_csv(input.path)
    t = output.ver['disasterMessage']
    new_ver.update(File_manager('ref', 'userdic', format='txt').ver)
    new_ver.update(File_manager('ref', 'stopwords').ver)
    compare = output.compare_version(new_ver)
    header = True
    n = len(compare)

    if n:
        output.update_version(new_ver)
        if n == 1 and compare[0] == 'disasterMessage' and t != '0':
            mode = 'a'
            header = False
            raw = raw.iloc[int(t):]  # versions are stored as strings
    else:
        return

    df_split = array_split(raw, max_workers)
    with Pool(max_workers, initializer=initializer) as p:
        df_list = parmap.map(tsk, df_split, pm_pbar=True, pm_pool=p)
    concat(df_list).to_csv(output.path, mode=mode, index=False, header=header)
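
tsk is defined elsewhere in the module; given the tokenizer and initializer shown later, a plausible sketch is a per-chunk wrapper that tokenizes every row (hypothetical, not the original code):

def tsk(df):
    """Hypothetical: tokenize each raw message in one DataFrame chunk."""
    df = df.copy()
    df['tokens'] = df.apply(tokenizer, axis=1)  # tokenizer reads row['message']
    return df
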
Example #5
class Editor:
    # subclasses are expected to set is_parent = False and override
    # add/delete/version_up with single-file signatures
    is_parent = True

    def __init__(self, file_name, format):
        self.file = File_manager('ref', file_name, format=format)

    def version_up(self, file_name):
        ver = int(self.file.ver[file_name]) + 1
        self.file.update_version({file_name: str(ver)})

    def delete(self, data, header, file_name):
        s = read_csv(self.file.path, squeeze=True, header=header)
        s[~s.isin(data)].to_csv(self.file.path,
                                index=False,
                                header=bool(header))
        if self.is_parent:
            self.version_up(file_name)
        else:
            self.version_up()

    def add(self, data, header, file_name):
        s1 = read_csv(self.file.path, squeeze=True, header=header)
        s2 = Series(data)
        concat([s1,
                s2[~s2.isin(s1)]]).sort_values().to_csv(self.file.path,
                                                        index=False,
                                                        header=bool(header))
        if self.is_parent:
            self.version_up(file_name)
        else:
            self.version_up()

    def edit_with_file(self, file_name, header, mode):
        path = File_manager('ref', file_name).path
        s = read_csv(path, squeeze=True, header=None)

        if self.is_parent:
            if mode == 'a':
                self.add(s, header, file_name)
            elif mode == 'd':
                self.delete(s, header, file_name)
        else:
            if mode == 'a':
                self.add(s)
            elif mode == 'd':
                self.delete(s)

        open(path, 'w').close()  # truncate the edit file once it has been applied
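
A brief usage sketch, assuming a 'stopwords' reference file with no header row whose version key matches its name (both assumptions, following the File_manager('ref', …) convention above):

editor = Editor('stopwords', 'csv')  # the format argument is assumed
editor.add(['그리고', '하지만'], None, 'stopwords')  # merge new entries, bump version
editor.delete(['호우'], None, 'stopwords')           # drop entries, bump version
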
Example #7
def crawl_messages(driver, wait, rng):
    if rng[1] == 1599:
        # the oldest range is located with a date-range search instead of
        # jumping to a page number
        date = '20160609'
        if rng[0] == 1:
            start = '2011/11/18 07:43:44'
        else:
            output = File_manager('raw', 'disasterMessage')
            start = read_csv(output.path).iloc[-1]['time']

        inputs = driver.find_elements_by_xpath('//ul//input')
        inputs[1].clear()
        inputs[2].clear()
        inputs[0].send_keys(start)
        inputs[1].send_keys(date)
        inputs[2].send_keys(date)

        check_stale = driver.find_element_by_id('bbs_tr_0_bbs_title')
        driver.find_element_by_class_name('search_btn').click()
        wait.until(EC.staleness_of(check_stale))
        driver.find_element_by_id('bbs_tr_0_bbs_title').click()
    else:
        while True:
            total = int(driver.find_element_by_id('totCnt').text)
            page_input = driver.find_element_by_id('bbs_page')
            page_input.clear()
            page_input.send_keys((total - rng[0]) // 10 + 1)
            check_stale = driver.find_element_by_id('bbs_tr_0_bbs_title')
            driver.find_element_by_class_name('go_btn').click()
            wait.until(EC.staleness_of(check_stale))

            total = int(driver.find_element_by_id('totCnt').text)
            i = (total - rng[0]) % 10
            res = int(driver.find_element_by_id(f'bbs_tr_{i}_num_td').text)

            if rng[0] == res:
                driver.find_element_by_id(f'bbs_tr_{i}_bbs_title').click()
                break

    messages = {'time': [], 'body': [], 'to': []}
    for _ in range(*rng):
        parse_messages(driver, wait, messages)

    return DataFrame(messages)
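
The click-and-wait idiom used twice above deserves a note: grab a reference to an element the navigation will replace, trigger the action, then wait for that reference to go stale before touching the new DOM. A generic sketch of the same pattern:

def click_and_wait(driver, wait, trigger_class, sentinel_id):
    """Generic sketch of the staleness pattern used in crawl_messages."""
    sentinel = driver.find_element_by_id(sentinel_id)  # node the reload replaces
    driver.find_element_by_class_name(trigger_class).click()
    wait.until(EC.staleness_of(sentinel))  # blocks until the old node detaches
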
Example #8
def disaster_message_tf_di(max_workers):
    input = File_manager('preprocessed', 'disasterMessage')
    output = File_manager('analyzed', 'disasterMessageTFDI')
    cmpr = output.compare_version(input.ver)
    new_ver = input.ver.copy()

    if new_ver['disasterMessage'] == '0':
        return
    if len(cmpr) == 1 and output.ver[cmpr[0]] == new_ver['disasterMessage']:
        return

    new_ver['disasterMessageTFDI'] = new_ver['disasterMessage']
    del new_ver['disasterMessage']
    output.update_version(new_ver)

    preprocessed = read_csv(input.path)
    preprocessed['time'] = to_datetime(preprocessed['time'])
    tokens = set(w for doc in preprocessed['tokens'] for w in doc.split('┃'))
    tokens = list(tokens)
    tf_di = DataFrame(index=tokens)
    split = preprocessed.groupby(preprocessed.time.dt.year)
    res = parmap.map(worker,
                     split,
                     tokens,
                     pm_pbar=True,
                     pm_processes=max_workers)

    for year, freq in res:
        tf_di[year] = freq

    tf_di = tf_di.reindex(sorted(tf_di.columns), axis=1)

    n = len(tf_di.columns)
    tf_di['tf_di'] = 0

    # recency-weighted term frequency: the i-th year (ascending) gets weight (i+1)/n
    for i, (name, col) in enumerate(tf_di.items()):
        if name == 'tf_di':
            break

        tf_di['tf_di'] += col * (i + 1) / n

    tf_di.sort_values(by='tf_di', ascending=False).to_csv(output.path)
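
With n yearly columns sorted ascending, the score is the sum over years of freq * (i+1)/n, so the most recent year carries weight 1. For example, frequencies 4, 2, 1 over 2019-2021 give 4*(1/3) + 2*(2/3) + 1*(3/3) = 11/3 ≈ 3.67. A small check:

from pandas import DataFrame

tf_di = DataFrame({2019: [4], 2020: [2], 2021: [1]}, index=['호우'])
n = len(tf_di.columns)
score = sum(col * (i + 1) / n for i, (_, col) in enumerate(tf_di.items()))
print(score)  # 호우    3.666667
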
Example #9
def tokenizer(df):
    # the stopword list is re-read on every call; a set makes membership cheap
    stopwords = File_manager('ref', 'stopwords')
    stopwords_data = set(read_csv(stopwords.path, squeeze=True))
    tokens = komoran.morphs(df['message'])
    return '┃'.join(t for t in tokens if t not in stopwords_data)
Example #10
def initializer():
    # build one Komoran instance per worker process
    global komoran
    userdic = File_manager('ref', 'userdic', format='txt')
    komoran = Komoran(userdic=userdic.path)
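
Komoran here is presumably konlpy's wrapper, whose constructor accepts a userdic path and whose morphs() returns a token list; that assumption is why each pool worker builds its own instance. A hedged usage sketch (requires konlpy plus the userdic/stopword reference files):

initializer()  # builds the per-process tagger
row = {'message': '호우 경보가 발령되었습니다'}
print(tokenizer(row))  # e.g. '호우┃경보┃발령', depending on the stopword list
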