Code Example #1
File: core.py Project: mkilli83/twitter-classifier
def get_raw_tweets(query_dict):
    """
    Get raw tweets
    :param query_dict:
        query_string: 'datacamp lang:en'
        time_since: '2019-03-01'
        time_until: '2019-05-01'
        max_tweets: 0 for unlimited
    :return: dataframe
    """

    file_name = _convert_query_dict_to_str_as_filename(query_dict)
    save_raw_file_name = paths.raw_tweets / f"raw_{file_name}.csv"
    print(file_name)
    if save_raw_file_name.is_file():
        print(f"Raw file {repr(save_raw_file_name)} already exists, reload")
        tweet_df = load_csv(save_raw_file_name)
    else:
        _validate_query(query_dict)

        print(f"Getting raw tweets with query:\n{query_dict!r}")
        tweet_criteria = _create_search_criteria(**query_dict)
        tweet_objects = _get_tweet_object(tweet_criteria)
        tweet_df = _convert_tweets_to_dataframe(tweet_objects)

        print(f"Saving raw tweets to: {repr(save_raw_file_name)}")
        tweet_df.to_csv(save_raw_file_name, index=False)

    print("Done getting raw tweets.")
    return tweet_df
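
A minimal call, for reference, might look like the sketch below; the query keys and values come straight from the docstring, while the import path is only an assumption based on the file name shown above.

# Usage sketch (hypothetical import path; query values taken from the docstring)
from core import get_raw_tweets

query = {
    'query_string': 'datacamp lang:en',
    'time_since': '2019-03-01',
    'time_until': '2019-05-01',
    'max_tweets': 0,  # 0 means unlimited
}
tweet_df = get_raw_tweets(query)  # returns a DataFrame, cached to CSV on disk
print(tweet_df.head())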
Code Example #2
    def __init__(self, splits: str):
        self.name = splits
        self.splits = splits.split(',')

        # Loading datasets
        self.data = []
        idx = 0
        for split in self.splits:
            # self.data.extend(json.load(open("data/vqa/%s.json" % split)))
            for row in load_csv('{}{}{}.csv'.format(VCSD_DATA_ROOT,
                                                    VCSD_FILE_BASE, split),
                                delimiter='\t'):
                r = {
                    'id': idx,
                    'raw_image_id': row['raw_image_id'],
                    'image_id': row['image_id'],
                    'utterance': row['utterance'],
                    'response': row['response'],
                    'label': row['label'],
                }
                self.data.append(r)
                idx += 1

        print("Load %d data from split(s) %s." % (len(self.data), self.name))

        # Convert list to dict (for evaluation)
        self.id2datum = {datum['id']: datum for datum in self.data}
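
Note that load_csv is used here as a row iterator with a delimiter argument and dict-style field access, unlike the pandas-flavoured load_csv(input_file, header=0) in the examples that follow. A stand-in compatible with this call site might look like the sketch below (the body is an assumption, not the project's actual helper):

import csv

def load_csv(path, delimiter=','):
    # Hypothetical stand-in: yield one dict per row, keyed by the header
    # line, matching the row['...'] access in the loop above.
    with open(path, newline='', encoding='utf-8') as f:
        yield from csv.DictReader(f, delimiter=delimiter)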
Code Example #3
def read_csv(input_file):
    """Reads a tab separated value file."""
    df = load_csv(input_file, header=0)
    fact = df['fact'].tolist()
    jlabel = df.loc[:, 'label'].values
    lines = [[jlabel[i], fact[i]] for i in range(len(jlabel))
             if isinstance(fact[i], str)]
    return lines
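
Examples #3 through #6, by contrast, call load_csv(input_file, header=0) and treat the result as a pandas DataFrame (.tolist(), .loc, .fillna). Under that reading, load_csv is presumably a thin wrapper around pandas.read_csv; here is a sketch consistent with those call sites (again an assumption, not the actual helper):

import pandas as pd

def load_csv(input_file, header=0, sep=','):
    # Hypothetical stand-in returning a DataFrame; the docstrings mention
    # tab-separated files, so sep may well default to '\t' in the real code.
    return pd.read_csv(input_file, header=header, sep=sep)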
Code Example #4
    def _read_csv(cls, input_file, content_col=None, label_col=None):
        """Reads a tab separated value file."""
        if content_col is None:
            content_col = 'content'
        if label_col is None:
            label_col = 'label_out'
        df = load_csv(input_file, header=0)
        df.fillna("|", inplace=True)
        job_fact = df[content_col].tolist()
        labels = df[label_col].tolist()
        lines = [[labels[i], job_fact[i]] for i in range(len(labels))
                 if isinstance(job_fact[i], str)]
        print('Length of data:', len(lines))
        print('Fact example:', job_fact[0])
        print('Label example:', labels[0])
        return lines
Code Example #5
    def get_train_labels(self, data_dir, label_col):
        df = load_csv(data_dir, header=0)
        label_list = df[label_col].apply(
            lambda x: self.change_label_to_id(x, label_col))
        return label_list.tolist()
Code Example #6
    def read_raw_csv(self, input_file):
        return load_csv(input_file, header=0)
Code Example #7
def main(verbose: bool):
    oenace_codes = load_csv(OENACE_FILE_PATH)
    sitc_codes = load_csv(SITC2_FILE_PATH)
    sitc_enriched_codes = load_enriched_sitc_codes(SITC2_ENRICHED_FILE_PATH)

    results = []
    for use_enriched_sitc in [True, False]:
        for threshold in [40, 50, 60, 70, 75, 80, 90, 95]:
            print(f'Doing threshold {threshold}')
            total = 0
            oenace_candidates = {}

            for sitc_code, sitc_title in sitc_codes.items():
                total += 1

                sitc_title_extendend = [sitc_title]

                # if using the enriched version from the correspondence tables, extend the sitc titles
                if use_enriched_sitc:
                    extending = sitc_enriched_codes.get(sitc_code, [])

                    if extending and verbose:
                        print(f'Extending "{sitc_title}" with: {extending}')

                    sitc_title_extendend += extending

                # hold a list of possible mapping candidates from oenace codes for each method
                oenace_candidates[sitc_code] = {
                    'text_similarity': [],
                    'inverted_index': [],
                }
                # step 1: fuzzy text-similarity matching between titles
                for oenace_code, oenace_title in oenace_codes.items():
                    text_similarity = perform_text_similarity(
                        sitc_title=sitc_title,
                        oeance_title=oenace_title,
                        should_extend_sitc=use_enriched_sitc,
                        sitc_extensions=sitc_title_extendend)

                    if text_similarity > threshold:
                        oenace_candidates[sitc_code]['text_similarity'].append(
                            {
                                'oenace_code': oenace_code,
                                'oenace_title': oenace_title,
                                'similarity': text_similarity,
                            })

            total_mapped = 0
            length_of_mapped_items = 0
            for sitc_code in oenace_candidates.keys():
                if oenace_candidates[sitc_code].get('text_similarity'):
                    total_mapped += 1
                    length_of_mapped_items += len(
                        oenace_candidates[sitc_code]['text_similarity'])

            results.append({
                'threshold': threshold,
                'enriched': use_enriched_sitc,
                'total_mapped_pct': round((total_mapped / len(sitc_codes)) * 100, 2),
                'avg_length': round(length_of_mapped_items / total_mapped, 2),
            })

    import csv
    SITC_PATH_PREPROCESSED = '../data/chart_results/text_similarity_thresholds.csv'
    with open(SITC_PATH_PREPROCESSED, 'w') as csvfile:
        writer = csv.DictWriter(csvfile,
                                fieldnames=[
                                    'threshold', 'enriched',
                                    'total_mapped_pct', 'avg_length'
                                ])
        writer.writeheader()
        writer.writerows(results)

    print(23)
Code Example #8
def main(use_enriched_sitc: bool, verbose: bool):
    oenace_codes = load_csv(OENACE_FILE_PATH)
    sitc_codes = load_csv(SITC2_FILE_PATH)
    sitc_enriched_codes = load_enriched_sitc_codes(SITC2_ENRICHED_FILE_PATH)

    method = 'stemming'
    metadata = load_metadata(method)

    counter = 0
    total = 0
    oenace_candidates = {}

    print(f'Start time: {datetime.now().isoformat()}')
    for sitc_code, sitc_title in sitc_codes.items():
        total += 1

        if total > 1:
            # keep `break` to stop early and reuse mappings from previous runs;
            # replace it with `pass` to map all of the codes
            break

        if verbose:
            print(f"Findind a mapping for: '{sitc_title}'")

        sitc_title_extendend = [sitc_title]

        # if using the enriched version from the correspondence tables, extend the sitc titles
        if use_enriched_sitc:
            extending = sitc_enriched_codes.get(sitc_code, [])

            if extending and verbose:
                print(f'Extending "{sitc_title}" with: {extending}')

            sitc_title_extendend += extending

        # hold a list of possible mapping candidates from oenace codes for each method
        oenace_candidates[sitc_code] = {
            'text_similarity': [],
            'inverted_index': [],
            'word_embedding': [],
        }

        # step 1: fuzzy text-similarity matching between titles
        for oenace_code, oenace_title in oenace_codes.items():
            text_similarity = perform_text_similarity(
                sitc_title=sitc_title, oeance_title=oenace_title,
                should_extend_sitc=use_enriched_sitc,
                sitc_extensions=sitc_title_extendend
            )

            if text_similarity > TEXT_SIMILARITY_THRESHOLD:
                oenace_candidates[sitc_code]['text_similarity'].append({
                    'oenace_code': oenace_code,
                    'oenace_title': oenace_title,
                    'similarity': text_similarity,
                })

        # step2: perform search using tf-idf weighting
        tf_idf_results = perform_exploration(query='.'.join(sitc_title_extendend), method=method, metadata=metadata)
        if tf_idf_results:
            oenace_candidates[sitc_code]['inverted_index'].extend([{
                'oenace_code': item[0],
                'oenace_title': oenace_codes[item[0]],
                'similarity': item[1],
            } for item in tf_idf_results])

        # step 3: word embedding
        # note: constructing Word2VecSimilarity inside the loop reloads the
        # model for every sitc code; hoisting it above the loop would avoid that
        word2vec_similarity = Word2VecSimilarity(verbose=verbose)

        for oenace_code, oenace_title in oenace_codes.items():

            if use_enriched_sitc:
                word_embedding_similarity = word2vec_similarity.enriched_sitc_similarity(sitc_title_extendend,
                                                                                         oenace_title)
            else:
                word_embedding_similarity = word2vec_similarity.text_similarity(sitc_title, oenace_title)

            # append all as possible candidates, but in the end choose only top 3 with minimum distance
            oenace_candidates[sitc_code]['word_embedding'].append({
                'oenace_code': oenace_code,
                'oenace_title': oenace_title,
                'similarity': word_embedding_similarity,
            })

        # sort by similarity descending; use a distinct loop variable so the
        # outer `method` passed to perform_exploration is not clobbered
        for candidate_method in ['text_similarity', 'inverted_index']:
            if oenace_candidates[sitc_code][candidate_method]:
                oenace_candidates[sitc_code][candidate_method] = sort_by_similarity(
                    oenace_candidates[sitc_code][candidate_method],
                    descending=True
                )
        oenace_candidates[sitc_code]['word_embedding'] = sort_by_similarity(
            oenace_candidates[sitc_code]['word_embedding'], descending=False
        )[:100]

        # find intersections from all steps
        intersections = find_matching_intersections(oenace_candidates[sitc_code])

        if intersections:
            counter += 1
            print(f"\nFindind a mapping for: '{sitc_title}'")
            for val in intersections:
                print(f' - {val}')
            print(end='\n\n')

        print(f'Done {total}/{len(sitc_codes)} so far')

    click.echo(f'Found a total of {counter} mappings from {total}')

    print(f'End time: {datetime.now().isoformat()}')

    start_gui(sitc_codes=sitc_codes, oeance_codes=oenace_codes, oenace_candidates=oenace_candidates)
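
The candidate lists in this example are ordered with sort_by_similarity. From its call sites it takes a list of {'oenace_code', 'oenace_title', 'similarity'} dicts plus a descending flag, sorting descending for the fuzzy and tf-idf scores and ascending for the word-embedding distances (smaller means closer). A minimal sketch consistent with that usage (an assumption, not the project's code):

def sort_by_similarity(candidates, descending=True):
    # Hypothetical stand-in: order candidate dicts by their 'similarity'
    # score; reverse=True puts the highest-scoring candidates first.
    return sorted(candidates, key=lambda c: c['similarity'], reverse=descending)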
Code Example #9
def main(verbose: bool):
    oenace_codes = load_csv(OENACE_FILE_PATH)
    sitc_codes = load_csv(SITC2_FILE_PATH)
    sitc_enriched_codes = load_enriched_sitc_codes(SITC2_ENRICHED_FILE_PATH)

    results = []

    for use_enriched_sitc in [True, False]:
        for method in ['lemmatizing', 'stemming']:
            metadata = load_metadata(method)

            counter = 0
            total = 0
            oenace_candidates = {}

            for sitc_code, sitc_title in sitc_codes.items():
                total += 1

                sitc_title_extendend = [sitc_title]

                # if using the enriched version from the correspondence tables, extend the sitc titles
                if use_enriched_sitc:
                    extending = sitc_enriched_codes.get(sitc_code, [])

                    if extending and verbose:
                        print(f'Extending "{sitc_title}" with: {extending}')

                    sitc_title_extendend += extending

                # hold a list of possible mapping candidates from oenace codes for each method
                oenace_candidates[sitc_code] = {
                    'text_similarity': [],
                    'inverted_index': [],
                }

                # perform search using tf-idf weighting (the only matching step in this variant)
                tf_idf_results = perform_exploration(
                    query='.'.join(sitc_title_extendend),
                    method=method,
                    metadata=metadata)
                if tf_idf_results:
                    oenace_candidates[sitc_code]['inverted_index'].extend([{
                        'oenace_code': item[0],
                        'oenace_title': oenace_codes[item[0]],
                        'similarity': item[1],
                    } for item in tf_idf_results])

            total_mapped = 0
            length_of_mapped_items = 0
            for sitc_code in oenace_candidates.keys():
                if oenace_candidates[sitc_code].get('inverted_index'):
                    total_mapped += 1
                    length_of_mapped_items += len(
                        oenace_candidates[sitc_code]['inverted_index'])

            results.append({
                'method': method,
                'enriched': use_enriched_sitc,
                'total_mapped_pct': round((total_mapped / len(sitc_codes)) * 100, 2),
                'avg_length': round(length_of_mapped_items / total_mapped, 2),
            })

    import csv
    SITC_PATH_PREPROCESSED = '../data/chart_results/lemavsstem.csv'
    with open(SITC_PATH_PREPROCESSED, 'w') as csvfile:
        writer = csv.DictWriter(csvfile,
                                fieldnames=[
                                    'method', 'enriched', 'total_mapped_pct',
                                    'avg_length'
                                ])
        writer.writeheader()
        writer.writerows(results)

    print(23)