def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--word2vec', type=str, required=True)
    parser.add_argument('--output_dir', type=str, required=True)
    args = parser.parse_args()

    df = pickle.load(open(args.word2vec, 'rb'))
    # Map each word to its row index, and stack the vectors into a single matrix.
    words_map = {word: idx for idx, word in enumerate(df['word'].values)}
    vectors_matrix = np.array([list(x) for x in df['vector'].values])

    make_sure_path_exists(args.output_dir)

    with open(os.path.join(args.output_dir, 'words_map.json'), 'w') as f:
        json.dump(words_map, f)

    with open(os.path.join(args.output_dir, 'vectors_matrix.pkl'), 'wb') as f:
        pickle.dump(vectors_matrix, f)
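The make_sure_path_exists helper used throughout these examples is defined elsewhere in the project and is not shown here; a minimal sketch of what it presumably does (create the output directory if it is missing) would be:

import os

def make_sure_path_exists(path):
    # Create the directory (and any missing parents); do nothing if it already exists.
    os.makedirs(path, exist_ok=True)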
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataframes', required=True, type=str, help='The pickle file containing the mapping between '
                                                                      'term ids and the renormalised '
                                                                      'dataframes.')
    parser.add_argument('--terms', required=True, type=str, help='The JSON file containing the mapping from terms to '
                                                                 'term ids.')
    parser.add_argument('--output_dir', required=True, type=str, help='The output directory.')
    args = parser.parse_args()

    df_dict = pickle.load(open(args.dataframes, 'rb'))
    terms_dict = json.load(open(args.terms, 'r', encoding='utf8'))
    # For every term, look up its dataframe by term id and rename the term-id column
    # to the human-readable term itself.
    result_dict = {x: df_dict[terms_dict[x]].rename(columns={terms_dict[x]: x}) for x in terms_dict}

    make_sure_path_exists(args.output_dir)

    with open(os.path.join(args.output_dir, 'final_df_dict.pkl'), 'wb') as f:
        pickle.dump(result_dict, f)
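To illustrate what the rename above produces, here is a toy, self-contained pandas sketch (the term id and values are made up, not taken from the project): each term's dataframe ends up with its column labelled by the human-readable term instead of the term id.

import pandas as pd

# Hypothetical inputs: one dataframe per term id, and a term -> term id mapping.
df_dict = {'/m/05z1_': pd.DataFrame({'/m/05z1_': [100, 80, 60]})}
terms_dict = {'python': '/m/05z1_'}

result_dict = {term: df_dict[mid].rename(columns={mid: term})
               for term, mid in terms_dict.items()}
print(result_dict['python'].columns.tolist())  # prints ['python']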
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--page_files', type=str, required=True)
    parser.add_argument('--keep_all_updates', action='store_true')
    parser.add_argument('--save_originals', action='store_true')
    parser.add_argument('--output_dir', type=str, required=True)
    args = parser.parse_args()

    list_of_filenames = [os.path.join(args.page_files, x) for x in os.listdir(args.page_files)]
    all_pages = crawl_entire_page(list_of_filenames)
    parsed_ads = parse_all_pages(all_pages, args.keep_all_updates)
    make_sure_path_exists(args.output_dir)

    with open(os.path.join(args.output_dir, 'all_ads.json'), 'w') as f:
        json.dump(parsed_ads, f)

    if args.save_originals:
        with open(os.path.join(args.output_dir, 'jobs_rss_raw.json'), 'w') as f:
            json.dump(all_pages, f)
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_dir', type=str, required=True)
    parser.add_argument('--crawl', action='store_true')
    parser.add_argument('--translate', action='store_true')
    args = parser.parse_args()
    # Crawl if --crawl was given, or if --translate was not (crawling is the default).
    if args.crawl or not args.translate:
        jobs_df = get_all_vocations(STARTING_URL, BASE_URL)
        print('crawling jobs completed')
        print(jobs_df.head())
        make_sure_path_exists(args.output_dir)
        with open(os.path.join(args.output_dir, 'skills_fr.pkl'), 'wb') as f:
            pickle.dump(jobs_df, f)
    if args.translate:
        if not args.crawl:
            jobs_df = pickle.load(
                open(os.path.join(args.output_dir, 'skills_fr.pkl'), 'rb'))
        print('starting translation')
        translated_df = translate_df(jobs_df)
        with open(os.path.join(args.output_dir, 'skills_en.pkl'), 'wb') as f:
            pickle.dump(translated_df, f)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--posts', type=str, required=True)
    parser.add_argument('--num_top', type=int, default=1)
    parser.add_argument('--output_dir', type=str, required=True)
    args = parser.parse_args()

    conf = SparkConf().set("spark.driver.maxResultSize", "10G"). \
        set("spark.hadoop.validateOutputSpecs", "false"). \
        set('spark.default.parallelism', '400')

    spark = SparkSession.builder.\
        appName("SO Tag first usage date").\
        config(conf=conf).\
        getOrCreate()

    sc = spark.sparkContext

    # Keep only posts that have an Id, keyed by post id.
    in_rdd = sc.textFile(args.posts).filter(lambda x: get_field(x, 'Id') is not None).\
        map(lambda x: (int(get_field(x, 'Id')), x))

    # Keep posts that have both tags and a creation date, then emit one
    # (creation date, tag) pair per tag on each post.
    in_rdd = in_rdd.filter(lambda x: get_field(x[1], 'Tags') is not None and get_field(x[1], 'CreationDate') is not None).\
        map(lambda x: (datetime.strptime(get_field(x[1], 'CreationDate').decode('utf-8'), DT_FORMAT),
                       get_tags(get_field(x[1], 'Tags').decode('utf-8')))).\
        flatMap(lambda x: [(x[0], y) for y in x[1]])

    tag_date_df = in_rdd.toDF(['CreationDate', 'Tag'])
    # For each tag, rank its posts by creation date and keep the num_top earliest appearances.
    window = Window.partitionBy(tag_date_df['Tag']).orderBy(tag_date_df['CreationDate'].asc())
    #tag_first_appearances = tag_date_df.groupBy('Tag').agg({'CreationDate': 'min'})
    tag_first_appearances = tag_date_df.select('*', rank().over(window).alias('rank')).\
        filter(col('rank') <= args.num_top)
    tag_first_appearances_pd = tag_first_appearances.toPandas().drop(columns=['rank'])

    make_sure_path_exists(args.output_dir)
    with open(os.path.join(args.output_dir, 'tag_'+str(args.num_top)+'_earliest_appearance.csv'), 'w') as f:
        tag_first_appearances_pd.to_csv(f)
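get_field, get_tags and DT_FORMAT are project helpers that are not shown in this example. Assuming --posts points at a Stack Overflow Posts.xml dump, where each line is a <row .../> element with attributes such as Id, Tags and CreationDate, a rough sketch of what they might look like is given below (the .decode('utf-8') calls above suggest the originals return bytes, so this sketch does too; the date format is likewise an assumption):

import html
import re

DT_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'  # assumed format of the dump's CreationDate attribute

def get_field(row, name):
    # Return the unescaped value of an XML attribute from a <row .../> line as bytes,
    # or None if the attribute is absent.
    match = re.search(r'\b{}="([^"]*)"'.format(name), row)
    return html.unescape(match.group(1)).encode('utf-8') if match else None

def get_tags(tags_str):
    # '<python><pandas>' -> ['python', 'pandas']
    return re.findall(r'<([^>]+)>', tags_str)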
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--mode',
        choices=['continue', 'start'],
        required=True,
        default='start',
        help='The mode of operation. If you were running this script before and '
        'were interrupted by an error (e.g. rate-limiting), use continue, '
        'otherwise use start.')
    parser.add_argument('--dataframes',
                        required=True,
                        type=str,
                        help='Pickle file containing a term id to dataframe '
                        'dictionary. The dataframes are all normalised '
                        'to have a maximum of 100.')
    parser.add_argument(
        '--time_settings',
        type=str,
        help='The JSON file containing the start and end times of the '
        'time period you are using.')
    parser.add_argument(
        '--state',
        type=str,
        help='Only for continue mode, the saved state file to load.')
    parser.add_argument(
        '--proxy',
        type=str,
        help='Proxy server address if you need to use one. Needs to be HTTPS.')
    parser.add_argument('--sleep_time',
                        type=int,
                        default=1,
                        help='Sleep time between subsequent queries, to '
                        'avoid rate-limiting. If you\'re rate-limited, '
                        'set this to 60 (unit is seconds).')
    parser.add_argument('--output_dir',
                        type=str,
                        required=True,
                        help='Output directory for the resulting pickle file.')
    parser.add_argument(
        '--starting_term',
        type=str,
        help='The term to start from in the renormalisation (which will '
        'be the term that has the maximum value of 100 in the end). '
        'If not provided, the starting term will be random.')
    parser.add_argument(
        '--terms',
        type=str,
        help='If you want to start from a specific term, this JSON needs to be '
        'provided in order to map the term to its term id in Google Trends.')

    args = parser.parse_args()

    if args.mode == 'continue' and (args.state is None
                                    or args.time_settings is not None
                                    or args.starting_term is not None
                                    or args.terms is not None):
        parser.error(
            'In "continue" mode, you should provide the pickle file containing the saved state '
            'and omit --time_settings, --starting_term and --terms.')
    if args.mode == 'start' and (args.state is not None
                                 or args.time_settings is None):
        parser.error(
            'In "start" mode, you should provide a time settings JSON and omit --state.')
    if args.starting_term is not None and args.terms is None:
        parser.error(
            'When you provide a starting term, you also need to provide the JSON mapping terms '
            'to their term ids.')

    if args.proxy is not None:
        proxy = {'https': args.proxy}
    else:
        proxy = None
    pytrends_obj = create_pytrends_obj(proxies=proxy,
                                       sleep_time=args.sleep_time)
    term_dataframe_dict = pickle.load(open(args.dataframes, 'rb'))
    terms_list = list(term_dataframe_dict.keys())

    if args.mode == 'start':
        settings_dict = json.load(
            open(args.time_settings, 'r', encoding='utf8'))
        starting_term = None
        if args.starting_term is not None:
            starting_term = args.starting_term
            term_to_mid = json.load(open(args.terms, 'r'))
            starting_term = term_to_mid[starting_term]

        conversion_ratio_list = find_all_interterm_conversion_rates_start(
            pytrends_obj,
            terms_list,
            settings_dict['time_start'],
            settings_dict['time_end'],
            starting_term=starting_term)
    else:
        saved_state = pickle.load(open(args.state, 'rb'))
        # saved_state holds the intermediate values captured when the previous run was interrupted.
        conversion_ratio_list = find_all_interterm_conversion_rates_continue(
            pytrends_obj, saved_state[0], saved_state[1], saved_state[2],
            saved_state[3], saved_state[4], saved_state[5], saved_state[6],
            saved_state[7], saved_state[8], saved_state[9], saved_state[10])

    if conversion_ratio_list is not None:
        renormalisation_dict = compile_final_renormalisation_ratios(
            conversion_ratio_list, terms_list)
        renormalise_all_tags(term_dataframe_dict, renormalisation_dict)

        make_sure_path_exists(args.output_dir)
        with open(os.path.join(args.output_dir, 'renormalised_df_dict.pkl'),
                  'wb') as f:
            pickle.dump(term_dataframe_dict, f)
        with open(os.path.join(args.output_dir, 'conversion_ratios.pkl'),
                  'wb') as f:
            pickle.dump(conversion_ratio_list, f)
    else:
        print(
            'If you have been rate-limited, increasing the sleep time to 60 seconds should do the trick!'
        )
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--term_list',
                        required=True,
                        type=str,
                        help='A txt file containing one term per line.')
    parser.add_argument('--output_dir',
                        required=True,
                        type=str,
                        help='Output directory for the resulting dictionary.')
    parser.add_argument('--use_original_terms',
                        action='store_true',
                        help='Whether to just use the original terms '
                        'and to avoid going through the suggestions. '
                        'Not recommended.')
    parser.add_argument(
        '--choose_first',
        action='store_true',
        help='If you don\'t feel like going through the entire '
        'list of suggestions for each term, use this '
        'option to always select the first one. '
        'Not recommended.')
    parser.add_argument(
        '--proxy',
        type=str,
        help='Proxy server address if you need to use one. Needs to be HTTPS.')
    parser.add_argument('--sleep_time',
                        type=int,
                        default=0,
                        help='Sleep time between subsequent queries, to '
                        'avoid rate-limiting. If you\'re rate-limited, '
                        'set this to 60 (unit is seconds).')
    args = parser.parse_args()

    if args.use_original_terms and args.choose_first:
        parser.error(
            '--use_original_terms and --choose_first are mutually exclusive')

    terms = open(args.term_list, 'r', encoding='utf8').readlines()
    terms = [x.strip() for x in terms if len(x.strip()) > 0]
    if args.use_original_terms:
        terms_dict = {x: x for x in terms}
    else:
        if args.proxy is not None:
            proxy = {'https': args.proxy}
        else:
            proxy = None
        pytrends_obj = create_pytrends_obj(proxies=proxy,
                                           sleep_time=args.sleep_time)
        terms_dict = dict()
        for term in terms:
            chosen_term = prompt_term_choice(pytrends_obj,
                                             term,
                                             default_choice=args.choose_first)
            # None stops the whole run; an empty string skips just this term.
            if chosen_term is None:
                break
            if chosen_term != '':
                terms_dict[term] = chosen_term

    make_sure_path_exists(args.output_dir)
    with open(os.path.join(args.output_dir, 'term_to_mid.json'),
              mode='w',
              encoding='utf8') as f:
        json.dump(terms_dict, f)
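prompt_term_choice and create_pytrends_obj are also project helpers that are not included here. Assuming create_pytrends_obj returns a pytrends TrendReq instance, a rough sketch of prompt_term_choice, consistent with how its return value is used in the loop above (None aborts, an empty string skips the term, anything else is the chosen term id), might be:

def prompt_term_choice(pytrends_obj, term, default_choice=False):
    # Fetch Google Trends topic suggestions for the term and pick one of them.
    suggestions = pytrends_obj.suggestions(term)  # list of {'mid', 'title', 'type'} dicts
    if not suggestions:
        return ''
    if default_choice:
        return suggestions[0]['mid']
    for i, s in enumerate(suggestions):
        print('{}: {} ({})'.format(i, s['title'], s['type']))
    answer = input('Choice for "{}" (index, blank to skip, q to quit): '.format(term))
    if answer == 'q':
        return None
    if answer == '':
        return ''
    return suggestions[int(answer)]['mid']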
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--mode',
        choices=['continue', 'start'],
        required=True,
        default='start',
        help='The mode of operation. If you were running this script before and '
        'were interrupted by an error (e.g. rate-limiting), use continue, '
        'otherwise use start.')
    parser.add_argument('--time_start',
                        type=str,
                        help='Starting point of the time period you want.')
    parser.add_argument('--time_end',
                        type=str,
                        help='Ending point of the time period you want.')
    parser.add_argument(
        '--terms',
        type=str,
        help='The JSON file containing the mapping from terms to term ids.')
    parser.add_argument(
        '--state',
        type=str,
        help='Only for continue mode, the saved state file to load.')
    parser.add_argument(
        '--proxy',
        type=str,
        help='Proxy server address if you need to use one. Needs to be HTTPS.')
    parser.add_argument('--sleep_time',
                        type=int,
                        default=1,
                        help='Sleep time between subsequent queries, to '
                        'avoid rate-limiting. If you\'re rate-limited, '
                        'set this to 60 (unit is seconds).')
    parser.add_argument('--leap_size', type=int, default=1)
    parser.add_argument('--output_dir',
                        type=str,
                        required=True,
                        help='Output directory for the resulting pickle file.')
    args = parser.parse_args()

    if args.mode == 'continue' and (args.terms is not None
                                    or args.state is None):
        parser.error(
            'In "continue" mode, you should provide the pickle file containing the saved state '
            'and omit --terms.')
    if args.mode == 'start' and (args.terms is None or args.state is not None
                                 or args.time_start is None
                                 or args.time_end is None):
        parser.error(
            'In "start" mode, you should provide a json file mapping terms to their term ids ("mid"s), '
            'in addition to the start and end times.')

    if args.proxy is not None:
        proxy = {'https': args.proxy}
    else:
        proxy = None
    pytrends_obj = create_pytrends_obj(proxies=proxy,
                                       sleep_time=args.sleep_time)

    if args.mode == 'start':
        terms_dict = json.load(open(args.terms, 'r', encoding='utf8'))
        terms_list = list(terms_dict.values())
        # Make sure the output directory exists before writing the settings file.
        make_sure_path_exists(args.output_dir)
        with open(os.path.join(args.output_dir, 'dataframe_settings.json'),
                  'w',
                  encoding='utf8') as f:
            json.dump(
                {
                    'time_start': args.time_start,
                    'time_end': args.time_end
                }, f)
        df_dict = retrieve_all_terms_start(pytrends_obj,
                                           terms_list,
                                           args.time_start,
                                           args.time_end,
                                           leap_size=args.leap_size)
    else:
        saved_state = pickle.load(open(args.state, 'rb'))
        df_dict = retrieve_all_terms_continue(pytrends_obj,
                                              saved_state[0],
                                              saved_state[1],
                                              saved_state[2],
                                              saved_state[3],
                                              saved_state[4],
                                              leap_size=saved_state[5])

    if df_dict is not None:
        make_sure_path_exists(args.output_dir)
        with open(os.path.join(args.output_dir, 'individual_df_dict.pkl'),
                  'wb') as f:
            pickle.dump(df_dict, f)
    else:
        print(
            'If you have been rate-limited, increasing the sleep time to 60 seconds should do the trick!'
        )