Example #1
def add_addresses(directory_path,
                  master_df_path,
                  master_df_id_col,
                  master_df_address_cols,
                  output_directory,
                  new_address_col_name,
                  id_col=None):
    files = get_files(directory_path, "csv")
    print(str(len(files)) + " files found in directory")
    ext = master_df_path.split(".")[-1]
    if (ext == "csv"):
        master_df = pd.read_csv(master_df_path)
    elif (ext == "xlsx"):
        master_df = pd.read_excel(master_df_path)
    else:
        raise ValueError("Invalid file extension: " + ext)
    for file in files:
        print("Current file: " + file, end='\r')
        add_address(file_path=file,
                    master_df=master_df,
                    master_df_id_col=master_df_id_col,
                    master_df_address_cols=master_df_address_cols,
                    output_directory=output_directory,
                    new_address_col_name=new_address_col_name,
                    id_col=id_col)
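All of the examples on this page call a project-specific get_files helper whose implementation is not shown, and its signature varies from project to project (a plain directory listing, an extension filter such as "csv", a glob pattern such as "*.bag", or an extensions= keyword). As a point of reference only, here is a minimal sketch of the extension-filtering variant that Example #1 appears to assume; it is an illustration, not the actual utilities module.

import os

def get_files(directory_path, extension=None):
    # Hypothetical sketch: list the full paths of files directly inside
    # directory_path, optionally keeping only those with the given extension.
    paths = []
    for name in sorted(os.listdir(directory_path)):
        full_path = os.path.join(directory_path, name)
        if not os.path.isfile(full_path):
            continue
        if extension is None or name.lower().endswith("." + extension.lower()):
            paths.append(full_path)
    return paths

Under that assumption, get_files(directory_path, "csv") in Example #1 would return the CSV files sitting directly in directory_path.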
Example #2
def run(self, day=global_define.TODAY):
    _dir = os.path.join(global_define.TEXT_DIR, str(day))
    file_paths = functs.get_files(_dir)
    for file_path in file_paths:
        self.build(file_path)
    self.flush()
Example #3
def run(self, day=global_define.TODAY):
    _dir = os.path.join(global_define.XML_DIR, str(day))
    file_paths = functs.get_files(_dir)
    for i, file_path in enumerate(file_paths):
        xml_dicts = self.parse(file_path)
        if xml_dicts:
            self.dump(xml_dicts, str(i), day)
Example #4
def get_subdirectory_files(file_string):
  subdirectory_csv_files = []
  _, subdirectories, _ = next(os.walk(os.getcwd()))
  for subdirectory in subdirectories:
    subdirectory_path = os.path.join(os.getcwd(), subdirectory)
    for subdirectory_csv_file in utilities.get_files(subdirectory_path, file_string):
      subdirectory_csv_files.append(subdirectory_csv_file)
  return subdirectory_csv_files
Example #5
def combine_directory(directory_path,
                      text_directory,
                      text_directory_relative,
                      output_directory,
                      address_col,
                      time_col_name,
                      company_name,
                      output_filename=None,
                      current_date=None,
                      date_col_name="date",
                      year_column_name="year",
                      rating_column_name="stars",
                      id_rule="col",
                      id_col="Store Company ID"):
    files = get_files(directory_path, "csv")
    records = []
    string_with_date = directory_path.split("\\")[-1]
    for file in files:
        df = pd.read_csv(file)
        # get id
        if id_rule == "col":
            company_id = list(df[id_col])[0]
        elif id_rule == "filename":
            file_split = file.split("\\")
            if (len(file_split[-1]) == 0):
                filename = file_split[-2]
            else:
                filename = file_split[-1]
            company_id = re.search(r"\d{10}", filename).group(0)

        # process dates
        date_converter(df, time_col_name, string_with_date, date_col_name,
                       current_date, year_column_name)

        # create record and append to list
        records.append(
            create_record(df,
                          company_id,
                          df[address_col][0],
                          company_name,
                          text_directory,
                          text_directory_relative,
                          year_column=year_column_name,
                          rating_column=rating_column_name))

    combined_df = pd.DataFrame.from_records(records)
    if output_filename is None:
        final_filename = string_with_date + ".csv"
    else:
        final_filename = output_filename + ".csv"
    final_outdir = output_directory + "\\" + final_filename
    combined_df.to_csv(final_outdir, index=False)
Example #6
def main():

    site = 'guardian'

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Load and concatenate files')

    dict_srcs = [
        x for x in utilities.get_files(utilities.blm_html_1pass_dir)
        if 'guardian' in x
    ]
    dates_articles_ = utilities.combine_dicts(dict_srcs)
    utilities.count_articles(dates_articles_)

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Parse comments')

    dates_articles = copy.deepcopy(dates_articles_)
    for date, days_articles in dates_articles_.items():
        for ix, article in enumerate(days_articles):
            raw_comments_pages = article['raw_comments']
            parsed_comments_li = []
            for raw_comments_page in raw_comments_pages:
                raw_comments_soup = bs(raw_comments_page)
                parsed_comments = get_page_comment_data(raw_comments_soup)
                parsed_comments_li += parsed_comments
            dates_articles[date][ix]['parsed_comments'] = parsed_comments_li

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Save outputs, one json per year')

    years = sorted(list(set([x.split('-')[0] for x in dates_articles.keys()])))
    dst_dir = os.path.join(utilities.blm_processed_parsed_dir, '2nd_iteration')
    for year in years:
        dates_articles2 = {}
        dst = os.path.join(dst_dir, site + '_' + year + '.json')
        for date, days_articles in dates_articles.items():
            if year in date:
                date2 = copy.deepcopy(date)
                dates_articles2[date2] = copy.deepcopy(days_articles)
        with open(dst, 'w') as f:
            json.dump(dates_articles2, f)
Example #7
def mean_weight():
    import pandas as pd
    from config import Config, get_services, recreate_config_file
    import sys
    from utilities import Print_Error, Select_Menu, get_files
    from data_manager import Generate_Pivot_Table

    # pd.set_option('display.max_rows',3000)

    available_services = get_services()
    available_files = get_files()

    if (len(available_services) == 0):
        Print_Error(".config File Not Found! Recreating .config File!")
        recreate_config_file()
        return

    # Use parameters from the command prompt when provided; otherwise
    # prompt the user to pick the input file and service interactively.
    if len(sys.argv) == 1:
        file_name = Select_Menu(available_files,
                                text="Input File Name",
                                return_type=int)
        service = Select_Menu(available_services,
                              text="Input Service",
                              return_type=int)
        file_name = available_files[file_name]
        service = available_services[service]
    else:
        # Assumes the file name and service are passed as the first two
        # command-line arguments.
        file_name = sys.argv[1]
        service = sys.argv[2]

    vessel_name = file_name.upper().split('/')[-1].rsplit('.', 1)[0]
    config = Config()
    import os
    config.build_config("%s/CONFIG/%s.config" % (os.getcwd(), service.upper()))
    config.set_vessel(vessel_name.rsplit('.', 1)[0])
    config.print_data()

    Generate_Pivot_Table(config, file_name)
Example #8
def evaluate_all(data_path,
                 scraped_directory_path,
                 output_directory_path,
                 counts_output_path,
                 id_col,
                 data_count_cols,
                 wait=[1, 1.5],
                 url_col=None,
                 search_terms=None,
                 new_url_col=None,
                 scraped_col_name="reviews_scraped",
                 review_col_name="review_count",
                 diff_col="difference",
                 prop_col="proportion"):
    count_df = get_review_counts(data_path, id_col, counts_output_path, wait,
                                 url_col, search_terms, new_url_col,
                                 review_col_name)

    for file in get_files(scraped_directory_path, "csv"):

        scraped_path = file
        scraped_df = pd.read_csv(scraped_path)
        out_df = pd.merge(count_df, scraped_df, on=[id_col])

        total_review_count = out_df[data_count_cols].sum(axis=1)

        out_df[diff_col] = total_review_count - out_df[review_col_name]
        out_df[prop_col] = total_review_count / out_df[review_col_name]
        out_df[scraped_col_name] = total_review_count

        out_df = out_df[[
            id_col, scraped_col_name, diff_col, prop_col, review_col_name
        ]]

        filename = file.split("\\")[-1].split(".")[0]
        output_path = output_directory_path + "\\" + filename + "_evaluation.csv"
        out_df.to_csv(output_path, index=False)
        print("Saved file as " + output_path)
Example #9
def main():

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Parse comments')

    # combine scraped blm data
    blm_srcs = utilities.get_files(
        utilities.blm_html_1pass_dir) + utilities.get_files(
            utilities.blm_html_2pass_dir)
    blm_srcs = [x for x in blm_srcs if site in x]
    blm = utilities.combine_dicts(blm_srcs)

    # sort by date
    blm_sorted = {}
    for k in sorted(blm.keys()):
        blm_sorted_v = copy.deepcopy(blm[k])
        blm_sorted[k] = blm_sorted_v

    article_counter = 0
    # iterate over websites
    counter = 0
    # for each website, iterate over intermediate pickle files
    blm_comments = {}

    # for each loaded dict, iterate over each day's articles
    for date, days_articles in blm_sorted.items():
        days_articles_li = []
        # for each article, get cleaned comments
        for ix, article in enumerate(days_articles):
            article_copy = copy.deepcopy(
                article
            )  # copy so when we change article we don't change blm_sorted article
            comments = article_copy['comments']
            # exclude articles that don't have comments
            if type(comments) == str:
                soup = bs(comments)
                ap = ArticlePosts(comments)
                article_copy['unparsed_comments'] = ap.make_unparsed_comments(
                    ap.replacements)
                article_copy['parsed_comments'] = ap.make_parsed_comments()
                article_copy['raw_comments'] = article_copy.pop('comments')
                days_articles_li.append(article_copy)
                counter += 1
                article_counter += 1

        # only add day's articles if there were any articles for that day
        if len(days_articles_li) > 0:
            blm_comments[date] = days_articles_li

        # at the end of the day, if counter greater than x, save and reinitialize dictionary and counter
        if counter >= 1000:
            save_outputs(blm_comments, dst_dir, site)
            # re-initialize dictionary and counter
            blm_comments = {}
            counter = 0

    # save remainders
    if len(blm_comments) > 0:
        save_outputs(blm_comments, dst_dir, site)

    print('Total articles with comments = %s' % article_counter)
Example #10
def merge_incremental_index(self, day=global_define.TODAY):
    _dir = os.path.join(global_define.INDEX_INCREMENTAL_DIR, str(day))
    file_list = functs.get_files(_dir)
    for file_path in file_list:
        self._merge(file_path)
Example #11
import utilities
import numpy as np
import os
import re

export_fol = "D:\\Apostolis\\Programming\\Python\\Athena_project\\EEG_Project\\data\\export"
answer_fol = "D:\\Apostolis\\Programming\\Python\\Athena_project\\EEG_Project\\data\\answers"
save_fol = "D:\\Apostolis\\Programming\\Python\\Athena_project\\EEG_Project\\data\\subjects"

export_files = utilities.get_files(export_fol)
answer_files = utilities.get_files(answer_fol)

for file in export_files:
    # Get subject names from file name
    sub_name = utilities.get_filename(file, start=4)

    # Get time zero (t0) of the subject from the info.csv file.
    # t0 represents the exact ms that the eeg recording started according to iMotions export file
    with open(os.path.join(save_fol, sub_name, 'info.csv')) as csv:
        lines = csv.readlines()
        line = lines[1].split(',')
        t0 = int(re.sub("[^0-9]", "", line[2]))

    f = open(file)
    text = f.readlines()
    f.close()
    time_zero = 0
    for ind, line in enumerate(text):
        # Search in text for this particular line which marks the start of the web application
        if "NavigateComplete\thttp://localhost/exp/main.php" in line:
            line = text[ind - 2].split('\t')
            time_zero = int(line[9])
Example #12
    parser.add_argument('-i',
                        '--image_topic',
                        dest='image_topic',
                        default=None,
                        help='Use specified image topic.')
    parser.add_argument('--save_stats',
                        action='store_true',
                        help='Save stats to csv file.')
    parser.add_argument('--make_plots',
                        type=bool,
                        default=True,
                        help='Make pdf of plots of results.')

    args, args_unknown = parser.parse_known_args()

    bag_directory = args.bag_directory
    if bag_directory is None:
        bag_directory = os.getcwd()

    if not os.path.exists(bag_directory):
        print("Bag directory {} does not exist.".format(bag_directory))
        exit()
    bag_files = utilities.get_files(bag_directory, '*.bag')
    print("Found {} bag files in {}.".format(len(bag_files), bag_directory))

    output_dir = utilities.create_directory(args.output_directory)
    print('Output directory for results is {}'.format(output_dir))

    bag_sweep(bag_files, output_dir, args)
    combine_results_in_csv_file(bag_files, output_dir)
Example #13
def __init__(self):
    dict.__init__(self)
    files = functs.get_files(global_define.INDEX_PRIME_DIR)
    for file_path in files:
        self._load(file_path)
Example #14
def Deposito():
    import os
    import openpyxl
    import pandas as pd
    from utilities import Print_Error, get_files, Select_Menu, create_directory, OpenFile

    available_file = []

    # Read search and save locations from the main config file
    search_locations = []
    save_location = ""
    main_config = open("CONFIG/MAIN.config")
    for line in main_config:
        if (line.split(';')[0] == "search_location"):
            search_locations.append(line.split(';')[1].strip())
        elif (line.split(';')[0] == "save_location"):
            save_location = line.split(';')[1].strip()

    for file in get_files(search_locations):
        if (file.upper().endswith('.XLS')):
            available_file.append(file)
    file_name = Select_Menu(available_file, "Select a File", return_type=int)
    file_name = available_file[file_name]

    if (file_name.upper().endswith(".XLS")):
        print("Importing XLS File!")
        sheet = "LinnerBooking"
        df = pd.read_excel(io=file_name, sheet_name=sheet)

        df = df[['Booking', 'Deposito', 'Weight', 'Tipo Ctr']]

        df = df.loc[(df['Deposito'] == "MEDLOG SAN ANTONIO")
                    | (df['Deposito'] == "SITRANS SAI ALTO DEPOT")
                    | (df['Deposito'] == "SITRANS VALPARAISO DEPOT")
                    | (df['Deposito'] == "MEDLOG SANTIAGO")]

        df['Weight'] = df['Weight'] / 1000  # Convert to tons
        # df = df.loc[(df['Tipo Ctr'] == '20DV') | (df['Tipo Ctr'] == '40DV') | (df['Tipo Ctr'] == '40HC')]
        table = pd.pivot_table(df, values='Weight', aggfunc='count',
                               index='Deposito', columns='Tipo Ctr')
        table = table.reindex(columns=['20DV', '40DV', '40HC'])

        table = table.rename(index={'MEDLOG SAN ANTONIO': 'SAI',
                                    'SITRANS SAI ALTO DEPOT': 'SAI',
                                    'SITRANS VALPARAISO DEPOT': 'VAP',
                                    'MEDLOG SANTIAGO': 'STGO'})

        table = table.groupby('Deposito').sum()
        # print(table.iloc[0]['20DV'])

        wb = openpyxl.Workbook()
        sheet = wb.active

        print(table)

        data = []
        for y in range(len(table.index)):
            data.append([])
            for x in range(len(table.columns)):
                data[-1].append(table.iloc[y][x])

        # Write the pivot table to the worksheet, one block of columns per deposit
        x = 1
        z = 0
        for deposit in data:
            r = 0
            sheet.cell(1, x, str(table.index[z]))
            for value in deposit:
                sheet.cell(2, x, str(table.columns[r]))
                sheet.cell(3, x, float(value))
                x += 1
                r += 1
            x += 1
            z += 1

        wb.save('demo.xlsx')
        wb.close()

        if (save_location == ""):
            print("Saving Output in Program Location!")
        elif (not os.path.exists(save_location)):
            Print_Error("Save Directory Not Found!")
            create_directory(save_location)

        try:
            table.to_excel(save_location + '/file_output.xlsx')
            print("Saved Successfully")
        except Exception:
            Print_Error('Error Saving File!')

        directory = os.getcwd() + '/demo.xlsx'
        OpenFile(directory)

    else:
        Print_Error("File not compatible!")
Example #15
        "--image_topic",
        dest="image_topic",
        default=None,
        help="Use specified image topic.",
    )
    parser.add_argument("--save_stats",
                        action="store_true",
                        help="Save stats to csv file.")
    parser.add_argument("--make_plots",
                        type=bool,
                        default=True,
                        help="Make pdf of plots of results.")

    args, args_unknown = parser.parse_known_args()

    bag_directory = args.bag_directory
    if bag_directory is None:
        bag_directory = os.getcwd()

    if not os.path.exists(bag_directory):
        print("Bag directory {} does not exist.".format(bag_directory))
        exit()
    bag_files = utilities.get_files(bag_directory, "*.bag")
    print("Found {} bag files in {}.".format(len(bag_files), bag_directory))

    output_dir = utilities.create_directory(args.output_directory)
    print("Output directory for results is {}".format(output_dir))

    bag_sweep(bag_files, output_dir, args)
    combine_results_in_csv_file(bag_files, output_dir)
Example #16
    combined_dataframes = pd.DataFrame(None, None, names)
    for dataframe in dataframes:
        trimmed_dataframe = pd.DataFrame(dataframe.transpose().values[1:2],
                                         columns=names)
        combined_dataframes = combined_dataframes.append(trimmed_dataframe,
                                                         ignore_index=True)
    return combined_dataframes


def average_results(directory, csv_files):
    combined_dataframes = combined_results(csv_files)
    names = combined_dataframes.columns
    mean_dataframe = pd.DataFrame()
    for name in names:
        mean_dataframe[name] = [combined_dataframes[name].mean()]
    averaged_results_file = os.path.join(directory, "averaged_results.csv")
    mean_dataframe.to_csv(averaged_results_file, index=False)


# Averages results from all *stats.csv files in a directory (including subdirectories).
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("directory",
                        help="Full path to directory where results files are.")
    args = parser.parse_args()
    results_csv_files = utilities.get_files(args.directory, "*stats.csv")
    if not results_csv_files:
        print("Failed to find stats.csv files")
        exit()
    average_results(args.directory, results_csv_files)
Example #17
                        action='store_true',
                        help='Write parameter sweep values to a table in pdf')

    args = parser.parse_args()
    if args.write_values_table and not args.write_values:
        print("If write_values_table enabled, write_values must be as well.")
        exit()

    directory = args.directory
    if directory is None:
        directory = os.getcwd()

    dataframes = []

    results_filestring = '*results.csv'
    results_files = utilities.get_files(directory, results_filestring)
    if len(results_files) == 0:
        print("No results csv files found in directory " + directory)
        exit()

    dataframes.append(plot_creator.load_dataframe(results_files))

    if args.write_values:
        values_filestring = '*values.csv'
        values_files = utilities.get_files(directory, values_filestring)
        values_dataframe = plot_creator.load_dataframe(values_files)
        values_dataframe.columns.name = 'values'
        dataframes.append(values_dataframe)

    pdf_filename = 'result_plots.pdf'
    plot_creator.create_pdf(dataframes, pdf_filename, args.write_values_table,
Example #18
def load_prime_index(self):
    file_list = functs.get_files(self.__PRIME_INDEX_DIR)
    for file_path in file_list:
        self._merge(file_path)
Example #19
import datetime
import requests
import urllib3
import pickle
import calendar
from collections import defaultdict
import numpy as np
import pandas as pd

# Project-local modules and selenium imports needed by the code below
import inputs
import utilities
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


today_dt = datetime.date.today()
yesterday_dt = today_dt - datetime.timedelta(days=1)
dates_ds = pd.date_range(inputs.start_date, inputs.end_date)
dates = [str(x.date()).replace('-', '/') for x in list(dates_ds)]
site = 'breitbart'
articles_dir = utilities.data_2018_dir
article_srcs = [x for x in utilities.get_files(articles_dir, extensions=['json']) if site in x]
blm_dir = utilities.blm_dir
blm_html_1pass_dir = utilities.blm_html_1pass_dir



# instantiate driver
def instantiate_driver(wait=10, url='https://google.com', headless=False):
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")
        chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')
    driver_path = utilities.chromedriver_path
    driver = webdriver.Chrome(driver_path, chrome_options=chrome_options)
    driver.get(url)
Example #20
def main():
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print(
        'Use google custom search api to retrieve on-topic articles for selected site'
    )
    # indicate the site on the google custom search api control panel

    # enter credentials from config.py file
    # if this file doesn't exist, create one and define google_custom_search_cx and developerKey variables
    cx = config.google_custom_search_cx
    developerKey = config.google_custom_search_developer_key
    service = build("customsearch", "v1", developerKey=developerKey)

    # make date ranges
    begin_mo, end_mo = [
        list([
            str(x.date()).replace('-', '')
            for x in pd.date_range('2013-07-01', '2018-08-31', freq=x)
        ]) for x in ['MS', 'M']
    ]
    dates = [
        ':'.join(['date', 'r', x[0], x[1]])
        for x in list(zip(begin_mo, end_mo))
    ]

    # define topics
    topics = ['black lives matter', 'police brutality']

    # iterate over each topic, srp, and dates
    starts = np.arange(1, 100, 10)
    res_li = []
    for topic in topics:
        for start in starts:
            for date in dates:
                if start == 91:
                    num = 9
                else:
                    num = 10
                res = service.cse().siterestrict().list(q=topic,
                                                        cx=cx,
                                                        hl='lang_en',
                                                        lr='lang_en',
                                                        num=num,
                                                        start=start,
                                                        sort=date).execute()
                res_li.append(res)
    if overwrite:
        with open(res_dst, 'wb') as f:
            pickle.dump(res_li, f)

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Build final dataframe and save to disk')
    df_res = []
    for srp in res_li:
        try:
            articles = srp['items']
            for article in articles:
                df_res.append([article['title'], article['link']])
        except:
            pass

    df_res = pd.DataFrame(df_res,
                          columns=['title', 'link'
                                   ]).drop_duplicates().set_index('title')
    if overwrite:
        df_res.to_pickle(os.path.join(dst_dir, site + '_blm_links.pkl'))

    preview(df_res)

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Filter out off-topic articles and save that to disk')

    # combine dictionaries
    articles_dir = utilities.data_2018_dir
    article_srcs = [
        x for x in utilities.get_files(utilities.data_2018_dir,
                                       extensions=['json']) if site in x
    ]
    dates_articles_ = utilities.combine_dicts(article_srcs)

    links = [
        x.lower().split('https://www.')[-1]
        for x in df_res.link.unique().tolist()
    ]
    dates_articles = {}
    for date, days_articles in dates_articles_.items():
        articles = []
        for article in days_articles:
            article2 = copy.deepcopy(article)
            try:
                link = article2['url'].lower().strip().split('http://www.')[-1]
            except:
                link = ''
            if link in links:
                articles.append(article2)
        if len(articles) > 0:
            dates_articles[date] = articles

    articles_dst = os.path.join(dst_dir, site + '_articles.pkl')
    with open(articles_dst, 'wb') as f:
        pickle.dump(dates_articles, f)
Example #21
import os
import utilities
from shutil import copy
import csv
import numpy as np

# Get current answer files
ans_fol = "..\\..\\data\\answers"
ans_paths = utilities.get_files(ans_fol)
subs = [utilities.get_filename(i) for i in ans_paths]

# Get export files (iMotion file)
exp_fol = "..\\..\\data\\export"
exp_paths = utilities.get_files(exp_fol)

# Create subject folders if they do not exist
sub_fol = "..\\..\\data\\subjects"
for ans_path, sub in zip(ans_paths, subs):
    folder = os.path.join(sub_fol, sub)

    # Check if folder already exists
    if not os.path.isdir(folder):
        # Create folder
        os.mkdir(folder)

        # Copy answer file to subject folder
        copy(ans_path, os.path.join(folder, 'answers.txt'))

        # Find and copy the correct export file to subject folder
        exp_ind = utilities.search_string_in_list(exp_paths, sub)
        copy(exp_paths[exp_ind], os.path.join(folder, 'export.txt'))
Example #22
def main():

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # DEFINE INPUTS AND LOAD DATA

    today_dt = datetime.date.today()
    yesterday_dt = today_dt - datetime.timedelta(days=1)
    dates_ds = pd.date_range(inputs.start_date, inputs.end_date)
    dates = [str(x.date()).replace('-', '/') for x in list(dates_ds)]
    overwrite = inputs.overwrite
    site = 'guardian'

    src = os.path.join(utilities.blm_dir, 'Google_CSE_Results',
                       site + '_articles.pkl')
    with open(src, 'rb') as f:
        dates_articles_ = pickle.load(f)
    interim_dir = os.path.join(utilities.blm_dir, 'z_Interim')
    utilities.mkdir(interim_dir)

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Pull comment URLs')

    dates_articles_dst = os.path.join(interim_dir,
                                      'articles_w_comments_urls.pkl')
    if overwrite:
        base_url = 'https://www.theguardian.com/discussion'
        dates_articles = copy.deepcopy(dates_articles_)
        counter = 0
        for date, days_articles in dates_articles.items():
            for ix, article in enumerate(days_articles):
                try:
                    article_url = article['url'].strip().lower()
                    r = requests.get(article_url)
                    article_soup = bs(r.text)
                    comments_div = article_soup.find('div', {'id': 'comments'})
                    soup_id = comments_div.attrs['data-discussion-key']

                    comments_url = base_url + soup_id
                    dates_articles[date][ix]['comments_url'] = comments_url
                except:
                    dates_articles[date][ix]['comments_url'] = 'no comments'

        with open(dates_articles_dst, 'wb') as f:
            pickle.dump(dates_articles, f)
    else:
        with open(dates_articles_dst, 'rb') as f:
            dates_articles = pickle.load(f)

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Remove articles with no comments')

    dates_articles2 = {}
    for date, days_articles in dates_articles.items():
        articles = []
        for article in days_articles:
            if article['comments_url'] != 'no comments':
                article_copy = copy.deepcopy(article)
                articles.append(article_copy)
        if len(articles) > 0:
            dates_articles2[date] = articles

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Scrape comments pages')

    if overwrite:
        dates_articles3 = {}
        counter = 0
        file_counter = 0
        for date, days_articles in dates_articles2.items():
            articles3 = []
            for article in days_articles:
                comments_url = article['comments_url']
                comments_li = []

                try:
                    comments_soup, comments = get_page_comments(comments_url)
                    comments_li.append(comments)
                    next_page_comments_url = get_next_page_url(comments_soup)

                    while next_page_comments_url is not None:
                        try:
                            next_page_comments_soup, next_page_comments = get_page_comments(
                                next_page_comments_url)
                            comments_li.append(next_page_comments)
                            next_page_comments_url = get_next_page_url(
                                next_page_comments_soup)
                        except:
                            next_page_comments_url = None
                except:
                    pass
                article3 = copy.deepcopy(article)
                article3['raw_comments'] = comments_li
                articles3.append(article3)
            if len(articles3) > 0:
                dates_articles3[date] = articles3

            counter += 1
            if counter >= 10:
                dst = os.path.join(utilities.blm_html_1pass_dir,
                                   site + str(file_counter) + '.pkl')
                with open(dst, 'wb') as f:
                    pickle.dump(dates_articles3, f)
                dates_articles3 = {}
                counter = 0
                file_counter += 1

        if counter > 0:
            dst = os.path.join(utilities.blm_html_1pass_dir,
                               site + str(file_counter + 1) + '.pkl')
            with open(dst, 'wb') as f:
                pickle.dump(dates_articles3, f)

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Print outputs')

    recovered_articles_srcs = [
        x for x in utilities.get_files(utilities.blm_html_1pass_dir)
        if site in x
    ]
    recovered_articles = utilities.combine_dicts(recovered_articles_srcs)
    n_articles = utilities.count_articles(recovered_articles)
    print('Recovered %s on-topic articles with comments' % n_articles)
Example #23
import utilities
import numpy as np
import os
import re
import sys

answer_fol = "..\\..\\data\\answers"
save_fol = "..\\..\\data\\subjects"

answer_files = utilities.get_files(answer_fol)

file = sys.argv[1]
# Get subject names from file name
sub_name = utilities.get_filename(file, start=4)

# Get time zero (t0) of the subject from the info.csv file.
# t0 represents the exact ms that the eeg recording started according to iMotions export file
with open(os.path.join(save_fol, sub_name, 'info.csv')) as csv:
    lines = csv.readlines()
    line = lines[2].split(',')
    t0 = int(re.sub("[^0-9]", "", line[2]))

f = open(file)
text = f.readlines()
f.close()
time_zero = 0
for ind, line in enumerate(text):
    # Search in text for this particular line which marks the start of the web application
    if "NavigateComplete\thttp://localhost/exp/main.php" in line:
        line = text[ind - 2].split('\t')
        time_zero = int(line[9])