def write_yearly_cluster_matches():
    def get_file_year(file_name):
        return int(get_file_name_without_ext(file_name))

    base_dir = os.path.join(Config.keywords_dir(), 'yearly_matches')
    create_dir_if_not_exists(base_dir)

    cluster_files = glob(
        os.path.join(Config.keywords_dir(), 'yearly_clusters', '*.pickle')
    )
    for prev_file_n, curr_file_n in window(sorted(cluster_files,
                                                  key=get_file_year)):
        print(f'Starting for {get_file_year(prev_file_n)}-{get_file_year(curr_file_n)}')
        with open(prev_file_n, 'rb') as prev_file:
            word_prev_cluster, prev_clusters = pickle.load(prev_file)
        with open(curr_file_n, 'rb') as curr_file:
            word_cluster, curr_clusters = pickle.load(curr_file)

        matches = find_cluster_matches_semantic(
            word_prev_cluster, prev_clusters,
            word_cluster, curr_clusters
        )
        matches_file_name = os.path.join(
            base_dir, f'matches_{get_file_year(curr_file_n)}.pickle'
        )
        with open(matches_file_name, 'wb') as matches_file:
            pickle.dump(matches, matches_file)
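# The `window` helper used above is not defined in this excerpt; a minimal
# sketch, assuming it yields consecutive (previous, current) pairs of its
# sorted input:
from itertools import tee


def window(iterable):
    first, second = tee(iterable)
    next(second, None)  # advance the second copy by one element
    return zip(first, second)  # yields (x0, x1), (x1, x2), ...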


def run_yearly():
    """
    Creates yearly topic models and saves them to disk.
    """
    print(f'Reading files from {Config.risk_dir()}')
    corpus = get_corpus()
    yearly_doc_ids = defaultdict(list)
    for k in corpus.keys():
        yearly_doc_ids[ReportInfo.from_doc_id(k).start_date.year].append(k)
    print(f'Read {len(corpus)} files.')

    base_dir = os.path.join(Config.top2vec_models_dir(), 'yearly_models')
    create_dir_if_not_exists(base_dir)
    print(f'Storing yearly models to {base_dir}.')

    for year, doc_ids in tqdm(yearly_doc_ids.items(),
                              total=len(yearly_doc_ids)):
        yearly_corpus = [corpus[d] for d in doc_ids]
        try:
            model = Top2Vec(documents=yearly_corpus,
                            document_ids=doc_ids,
                            tokenizer=RiskSectionCleaner(),
                            keep_documents=False,
                            speed='learn',
                            workers=24)
            model.save(os.path.join(base_dir, f'{year}_topics'))
        except Exception as exc:
            print(f'Could not create topic model for year {year}: {exc}')
            continue
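# Sketch of reading one of the saved models back; assumes the standard top2vec
# API (Top2Vec.load, get_num_topics, get_topics) and a year for which a model
# was actually written above. The helper name is illustrative only.
def load_yearly_model(year):
    model_path = os.path.join(Config.top2vec_models_dir(), 'yearly_models',
                              f'{year}_topics')
    model = Top2Vec.load(model_path)
    topic_words, word_scores, topic_nums = model.get_topics(
        model.get_num_topics())
    return model, topic_words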


def write_keywords():
    keywords_dir = Config.text_rank_keywords_dir()
    create_dir_if_not_exists(keywords_dir)
    print(f'Writing found keywords to {keywords_dir}')

    risk_files = get_risk_filenames()
    with ProcessPoolExecutor(max_workers=4) as executor:
        tasks = [
            executor.submit(_write_keywords_for_risk_file, risk_file)
            for risk_file in risk_files
        ]

        for _ in tqdm(as_completed(tasks), total=len(tasks)):
            pass


def write_doc_sentiment_files():
    base_dir = Config.risk_sentiment_dir()
    create_dir_if_not_exists(base_dir)

    for risk_file in tqdm(get_risk_filenames()):
        risk_section = risk_file.read_text(encoding='utf-8')
        sentiment_df = get_sentiment(risk_section)

        report_info = report_info_from_risk_path(risk_file)
        file_path = os.path.join(base_dir, str(report_info.cik))
        create_dir_if_not_exists(file_path)

        filename = get_file_name_without_ext(report_info.get_file_name())
        sentiment_filename = os.path.join(file_path, f'{filename}.pickle')

        sentiment_df.to_pickle(path=sentiment_filename)
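# Sketch of reading one of the per-filing sentiment DataFrames back;
# pandas.read_pickle is the counterpart of DataFrame.to_pickle used above, and
# the helper name and parameters are illustrative only.
import pandas as pd


def load_doc_sentiment(cik, filename):
    path = os.path.join(Config.risk_sentiment_dir(), str(cik),
                        f'{filename}.pickle')
    return pd.read_pickle(path)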


def _write_keywords_for_risk_file(risk_file_path):
    keywords_dir = Config.text_rank_keywords_dir()
    report_info = report_info_from_risk_path(risk_file_path)
    cik_dir = os.path.join(keywords_dir, str(report_info.cik))
    create_dir_if_not_exists(cik_dir)

    base_filename = os.path.join(
        cik_dir, get_file_name_without_ext(report_info.get_file_name()))

    try:
        text = risk_file_path.read_text(encoding='utf-8')
    except UnicodeDecodeError:
        return

    keyword_extractor = KeywordExtractor(min_length=1,
                                         max_length=3,
                                         num_keywords=100)
    tr_keywords = keyword_extractor.extract_using_text_rank(text)
    with open(base_filename + '.txt', 'w+', encoding='utf-8') as keywords_file:
        keywords_file.write('\n'.join(tr_keywords))
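# The keyword files written above are plain newline-separated text; a minimal
# sketch of reading one back (the helper name is illustrative only):
def read_keywords_file(keywords_txt_path):
    with open(keywords_txt_path, encoding='utf-8') as keywords_file:
        return keywords_file.read().splitlines()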


def write_yearly_keyword_clusters():
    import risk_detection.analysis.keyword_extraction
    keywords = risk_detection.analysis.keyword_extraction.get_all_keywords()

    print('Creating keyword clusters by year.')
    keyword_clusters_by_year = dict()

    base_dir = os.path.join(Config.keywords_dir(), 'yearly_clusters')
    create_dir_if_not_exists(base_dir)

    keys_by_year = sorted(keywords.keys(), key=lambda x: x.start_date)
    for year, files in groupby(keys_by_year, key=lambda x: x.start_date.year):
        print(f'Creating clusters for {year}')
        all_keywords = sorted(
            set(chain(*[keywords[file].keywords for file in files]))
        )
        cluster_lookup, keyword_clusters = cluster(all_keywords)
        with open(os.path.join(base_dir, f'{year}.pickle'), 'wb') as dump_file:
            pickle.dump((cluster_lookup, keyword_clusters), dump_file,
                        protocol=pickle.HIGHEST_PROTOCOL)
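# Each yearly pickle written above holds the tuple returned by `cluster`: a
# keyword-to-cluster lookup plus the keyword clusters themselves, the same
# pair that write_yearly_cluster_matches unpickles per year. A minimal loader
# sketch (helper name illustrative only):
def load_yearly_clusters(year):
    path = os.path.join(Config.keywords_dir(), 'yearly_clusters',
                        f'{year}.pickle')
    with open(path, 'rb') as cluster_file:
        cluster_lookup, keyword_clusters = pickle.load(cluster_file)
    return cluster_lookup, keyword_clusters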
Example #7
    def decorator(func):
        cache_dir = os.path.join(_base_cache_dir, dir_name)
        create_dir_if_not_exists(cache_dir)

        @wraps(func)
        def wrapper(obj, text):
            ## TODO: Hashing fails as python produces different hashes for
            ## the same text over different runs
            hashh = hashlib.sha256(text.encode('utf-8')).hexdigest()
            file_path = os.path.join(cache_dir, f'{hashh}.pickle')
            try:
                # Get from cache
                with open(file_path, 'rb') as f:
                    return pickle.load(f)
            except FileNotFoundError:
                # Cache result
                result = func(obj, text)
                with open(file_path, 'wb') as f:
                    pickle.dump(result, f, protocol=pickle.HIGHEST_PROTOCOL)
                return result

        return wrapper

    return decorator
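# Usage sketch: the factory's outer `def` line is cut off above, so the name
# `pickle_cache` here is hypothetical and stands for whatever that factory is
# actually called; `wrapper(obj, text)` implies it decorates an instance
# method that takes a single text argument, e.g.:
#
#     class SentimentAnalyzer:
#         @pickle_cache('sentiment')
#         def analyse(self, text):
#             ...  # expensive work, cached per sha256 of `text`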
Example #8
        return industry_groups

    @staticmethod
    def create_by_sic_char_length(sic_c: str = 'sic2') -> List[IndustryGroup]:
        group = IndustryGroupCreator.cik_sic_df[['cik', sic_c]].groupby(sic_c)
        res = list()
        for sic, group_df in group:
            ciks = set(group_df['cik'].unique())
            # TODO: Fix
            # res.append(IndustryGroup(sic, ciks))
        return res


if __name__ == '__main__':
    base_dir = os.path.join(Config.keywords_dir(), 'industry_groups',
                            'sic_groups')
    create_dir_if_not_exists(base_dir)

    groups = IndustryGroupCreator.create_by_sic_division()
    print('Created industry groups.')
    for group in groups:
        print(f'Creating clusters for SIC division {group.sic_category}')
        industry_yearly_clusters = group.cluster_keywords()
        if industry_yearly_clusters:
            with open(os.path.join(base_dir, f'{group.sic_category}.pickle'),
                      'wb') as f:
                pickle.dump(industry_yearly_clusters,
                            f,
                            protocol=pickle.HIGHEST_PROTOCOL)
            print(f'Saved clusters for SIC division {group.sic_category}')