def readfile(name):
    """Reload the cached [columns, rows] pair from ./data_store.pkl and
    export it as a table to ./results/<name>_results.xlsx."""
    with open('./data_store.pkl', 'rb') as handle:
        payload = pickle.load(handle)

    # payload[0] holds the column headers, payload[1] the row data.
    table = pd.DataFrame(data=payload[1], columns=payload[0])

    write_excel = create_excel_file('./results/{}_results.xlsx'.format(name))
    workbook = openpyxl.load_workbook(write_excel)
    sheet = workbook[workbook.sheetnames[-1]]
    print_df_to_excel(df=table, ws=sheet)
    workbook.save(write_excel)
Example #2
0
        pickle.dump([data_store_columns, countrydataoverall],
                    handle,
                    protocol=pickle.HIGHEST_PROTOCOL)

    with open('IDEAScitystatedata{}.pkl'.format(indextostart), 'wb') as handle:
        pickle.dump([data_store_columns2, citystatedataoverall],
                    handle,
                    protocol=pickle.HIGHEST_PROTOCOL)

    print('Progress: {} out of {} done'.format(authors + indextostart,
                                               len(delimitedlocation)))

    #if authors > 2:
    #    break

# Export both result tables — country-level and city/state-level — each into
# its own freshly created workbook under ./results/.
for label, table, headers in (
        ('IDEAScountrydata', countrydataoverall, data_store_columns),
        ('IDEAScitystatedata', citystatedataoverall, data_store_columns2),
):
    write_excel = create_excel_file('./results/{}_results.xlsx'.format(label))
    wb = openpyxl.load_workbook(write_excel)
    ws = wb[wb.sheetnames[-1]]
    print_df_to_excel(df=pd.DataFrame(data=table, columns=headers), ws=ws)
    wb.save(write_excel)
Example #3
0
        title = 'NA'
        email = 'NA'

    personaldetails.append(name)
    personaldetails.append(title)
    personaldetails.append(email)

    personaldata.append(personaldetails)

    with open('GSaffiliationscrap{}.pkl'.format(indextostart), 'wb') as handle:
        pickle.dump([data_store_columns, personaldata],
                    handle,
                    protocol=pickle.HIGHEST_PROTOCOL)

    print('Progress: {} out of {} for {} done'.format(authors + indextostart,
                                                      numberofauthors, name))

    #if authors > 3:
    #    break

# Dump the scraped author details to ./results/GSAffiliationScrap_results.xlsx.
write_excel = create_excel_file(
    './results/{}_results.xlsx'.format('GSAffiliationScrap'))
wb = openpyxl.load_workbook(write_excel)
ws = wb[wb.sheetnames[-1]]
results_df = pd.DataFrame(data=personaldata, columns=data_store_columns)
print_df_to_excel(df=results_df, ws=ws)
wb.save(write_excel)

# Report total wall-clock runtime in hours.
runtime_s = time.time() - start
elapsed = runtime_s / 3600
print(f"Elapsed time: {elapsed} hours")
Example #4
0
    title = affiliationtable.text
    # print(title)
    verifiedemail = soup.find('div', attrs={'id': 'gsc_prf_ivh'})
    email = verifiedemail.text

    personaldetails.append(title)
    personaldetails.append(email)

    personaldata.append(personaldetails)
    with open('GSauthorduplicate{}.pkl'.format(indextostart), 'wb') as handle:
        pickle.dump([data_store_columns, personaldata],
                    handle,
                    protocol=pickle.HIGHEST_PROTOCOL)

    print('Progress: {} out of {} for {} done'.format(authors + indextostart,
                                                      numberofauthors, name))

    #if authors > 3:
    #    break

# Dump the duplicate-profile author details to ./results/GSauthorduplicate_results.xlsx.
write_excel = create_excel_file(
    './results/{}_results.xlsx'.format('GSauthorduplicate'))
wb = openpyxl.load_workbook(write_excel)
ws = wb[wb.sheetnames[-1]]
duplicates_df = pd.DataFrame(data=personaldata, columns=data_store_columns)
print_df_to_excel(df=duplicates_df, ws=ws)
wb.save(write_excel)

# Report total wall-clock runtime in hours.
runtime_s = time.time() - start
elapsed = runtime_s / 3600
print(f"Elapsed time: {elapsed} hours")
Example #5
0
# Stitch the remaining per-part frames onto the accumulated table.
# BUGFIX: DataFrame.append was removed in pandas 2.0; a single pd.concat is
# the supported replacement and also avoids the quadratic copying of
# chained appends.
personaldata = pd.concat([personaldata, df7a, df8, df9, df10, df11])

data_store_columns = ['Name', 'Title', 'Email']

#data_store_columns = ['Name', 'Total Citations', 'Total Citations (5 years)', 'h-index', 'h-index (5 years)',
#                      'i10-index', 'i10-index (5 years)', 'Journal Authors', 'Journal Titles', 'Journal Names',
#                      'Journal Years', 'Number of publications', 'Probability of correct profile']

#data_store_columns = ['Name', 'Total Citations', 'Total Citations (5 years)', 'h-index', 'h-index (5 years)',
#                      'i10-index', 'i10-index (5 years)', 'Journal Authors', 'Journal Titles', 'Journal Names',
#                      'Journal Years', 'Number of publications', 'Probability of correct profile', 'Keywords match',
#                      'Title', 'Email']

# Cache the merged table so later runs can skip the scrape entirely.
with open('GSaffiliationscrapComplete.pkl', 'wb') as handle:
    pickle.dump([data_store_columns, personaldata.values],
                handle,
                protocol=pickle.HIGHEST_PROTOCOL)

# Export the merged table to Excel.
write_excel = create_excel_file(
    './results/{}_results.xlsx'.format('GSaffiliationscrapComplete'))
wb = openpyxl.load_workbook(write_excel)
ws = wb[wb.sheetnames[-1]]
print_df_to_excel(df=pd.DataFrame(data=personaldata,
                                  columns=data_store_columns),
                  ws=ws)
wb.save(write_excel)
    personaldata.append(personaldetails)

    with open('GSdatascrap{}.pkl'.format(indextostart), 'wb') as handle:
        pickle.dump([data_store_columns, personaldata],
                    handle,
                    protocol=pickle.HIGHEST_PROTOCOL)

    with open('authorduplicate{}.pkl'.format(indextostart), 'wb') as handle:
        pickle.dump(
            [['Authors with more than 1 profile in GS'], authorsduplicate],
            handle,
            protocol=pickle.HIGHEST_PROTOCOL)

    print('Progress: {} out of {} for {} done'.format(authors + indextostart,
                                                      numberofauthors, name))

    # if authors > 3:
    #    break

# Dump the scraped Google Scholar data to ./results/GSWebScrap_results.xlsx.
write_excel = create_excel_file(
    './results/{}_results.xlsx'.format('GSWebScrap'))
wb = openpyxl.load_workbook(write_excel)
ws = wb[wb.sheetnames[-1]]
scrape_df = pd.DataFrame(data=personaldata, columns=data_store_columns)
print_df_to_excel(df=scrape_df, ws=ws)
wb.save(write_excel)

# Report total wall-clock runtime in hours.
runtime_s = time.time() - start
elapsed = runtime_s / 3600
print(f"Elapsed time: {elapsed} hours")
def pso_ga(func, pmin, pmax, smin, smax, int_idx, params, ga, dv):
    """Hybrid particle-swarm / genetic-algorithm minimiser.

    Runs a PSO loop over a swarm of particles; on every PSO iteration it can
    optionally refine a random subset of the swarm (plus a small PSO hall of
    fame) with a short GA, feeding improved individuals back into the swarm.
    The best particle, per-generation statistics and the decision-variable
    names are written to ./results/pso_ga_results.xlsx.

    Args:
        func: Objective function; maps a particle (list) to a 1-tuple
            fitness. Minimised (single objective, weight -1).
        pmin, pmax: Per-dimension position bounds (equal-length lists).
        smin, smax: Per-dimension velocity bounds.
        int_idx: Index (or list of indices) of integer-valued dimensions.
        params: Hyper-parameter dict. NOTE: the crossover/mutation
            probabilities are stored under the keys 'cxpd'/'mutpd'
            (existing callers rely on those spellings).
        ga: If truthy, run the GA refinement step each PSO iteration.
        dv: Decision-variable names, printed to the results sheet.

    Returns:
        (pop, logbook, best): final swarm, DEAP logbook, best particle found.
    """
    # Setting params
    c1, c2, wmin, wmax, ga_iter_min, ga_iter_max, iter_gamma, ga_num_min, ga_num_max, num_beta,\
    tourn_size, cxpb, mutpb, indpd, eta,\
    pso_iter, swarm_size = \
    params['c1'], params['c2'], params['wmin'], params['wmax'],\
    params['ga_iter_min'], params['ga_iter_max'], params['iter_gamma'],\
    params['ga_num_min'], params['ga_num_max'], params['num_beta'],\
    params['tourn_size'], params['cxpd'], params['mutpd'], params['indpd'], params['eta'],\
    params['pso_iter'], params['swarm_size']

    # int_idx must be a list. If a single number is given, convert to list.
    if isinstance(int_idx, int):
        int_idx = [int_idx]

    # NOTE(review): creator.create re-registers these classes on every call;
    # DEAP warns if pso_ga is invoked more than once per process.
    creator.create("FitnessMin", base.Fitness, weights=(-1.0,))  # Minimization of a single scalar value
    creator.create("Particle", list, fitness=creator.FitnessMin, speed=list,
                   smin=None, smax=None, best=None, int_idx=None)

    toolbox = base.Toolbox()
    toolbox.register("particle", generate_part, dim=len(pmin), pmin=pmin, pmax=pmax, smin=smin, smax=smax,
                     int_idx=int_idx)
    toolbox.register("population", tools.initRepeat, list, toolbox.particle)
    toolbox.register("update", updateParticle, c1=c1, c2=c2)
    toolbox.register("evaluate", func)

    toolbox.register("mate", tools.cxTwoPoint)
    #toolbox.register("mutate", ga_hybrid_polymutate, low=pmin, up=pmax, indpb=indpd, eta=eta)
    toolbox.register("mutate", ga_hybrid_gaussianmutate, low=pmin, up=pmax, indpb=indpd, sigma=smax)

    pop = toolbox.population(n=swarm_size)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)

    logbook = tools.Logbook()
    logbook.header = ["gen", "evals"] + stats.fields

    best = None
    # The PSO hall of fame seeds ~20% of the GA sub-population each iteration.
    pso_hof_num = max(1, round(ga_num_min * 0.2))
    pso_hof = tools.HallOfFame(pso_hof_num)

    for g in range(pso_iter):
        # Number of extra evaluations spent by the GA this iteration; stays 0
        # when ga is False (previously this relied on catching
        # UnboundLocalError at the logbook.record call below).
        ga_eval = 0

        # PSO segment first
        for part in pop:
            part.fitness.values = toolbox.evaluate(part)
            # Note: Fitness comparisons will compare the weighted value. Since weight is negative,
            # the comparison would be opposite unless you specify .values instead.
            if not part.best or part.best.fitness.values[0] > part.fitness.values[0]:
                part.best = creator.Particle(part)
                part.best.fitness.values = part.fitness.values
            if not best or best.fitness.values[0] > part.fitness.values[0]:
                best = creator.Particle(part)
                best.fitness.values = part.fitness.values

        for part in pop:
            # Linear annealing for inertia velocity coefficient (the w weights)
            toolbox.update(part, best=best, w=wmax - (wmax-wmin)*(g/pso_iter)**iter_gamma)

        if ga:
            # GA segment.
            # GA sub-population size and generation count are annealed from
            # their min values toward their max values as PSO progresses.
            # BUGFIX: the original mixed the two parameter families
            # (ga_num_max - ga_iter_min) and had the iter_gamma/num_beta
            # exponents swapped relative to their naming.
            ga_pop = round(ga_num_min + (g/pso_iter)**num_beta*(ga_num_max-ga_num_min))
            ga_gen = round(ga_iter_min + (g/pso_iter)**iter_gamma*(ga_iter_max-ga_iter_min))
            if len(pso_hof) == 0:
                ga_mask = [1 for _ in range(ga_pop)] + [0 for _ in range(swarm_size-ga_pop)]
                random.shuffle(ga_mask)
                population = [x for x, mask in zip(pop, ga_mask) if mask == 1]
            else:
                # Reserve pso_hof_num slots for the PSO hall-of-fame members.
                ga_pop -= pso_hof_num
                ga_mask = [1 for _ in range(ga_pop)] + [0 for _ in range(swarm_size - ga_pop)]
                random.shuffle(ga_mask)
                population = [x for x, mask in zip(pop, ga_mask) if mask == 1] + pso_hof.items

            halloffame = tools.HallOfFame(ga_pop)
            halloffame.update(population)
            # Begin the generational process
            for gen in range(ga_gen):
                # Select the next generation individuals. Built in tournament selector does not work for multi-objective
                # offspring = toolbox.select(population, len(population))
                # Own selection using tournament. Will work for multi-objective.
                chosen = []
                for i in range(ga_pop):
                    aspirants = selRandom(population, tourn_size)
                    scores = [x.fitness.values[0] for x in aspirants]
                    # First aspirant with the lowest weighted fitness wins.
                    chosen_idx = scores.index(min(scores))
                    chosen.append(aspirants[chosen_idx])
                offspring = chosen

                # Vary the pool of individuals
                offspring = varAnd(offspring, toolbox, cxpb, mutpb)

                # Evaluate the individuals with an invalid fitness
                invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
                ga_eval += len(invalid_ind)
                fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
                for ind, fit in zip(invalid_ind, fitnesses):
                    ind.fitness.values = fit

                # Update the hall of fame with the generated individuals
                halloffame.update(offspring)

                # Replace the current population by the offspring
                population[:] = offspring

            # Write GA improvements back into the swarm.
            counter = 0
            if best.fitness.values[0] > halloffame[0].fitness.values[0]:
                best = creator.Particle(halloffame[0])
                best.fitness.values = halloffame[0].fitness.values
            for idx, mask in enumerate(ga_mask):
                if mask == 1:
                    try:
                        if pop[idx].fitness.values[0] > halloffame[counter].fitness.values[0]:
                            pop[idx] = halloffame[counter]
                            # BUGFIX: seed the particle's personal best from
                            # the GA individual that replaced it, not from the
                            # stale loop variable `part`.
                            pop[idx].best = creator.Particle(halloffame[counter])
                            pop[idx].best.fitness.values = halloffame[counter].fitness.values
                        counter += 1
                    except IndexError:
                        # halloffame can be shorter than the masked slots.
                        break
        pso_hof.update(pop)

        # Gather all the fitnesses in one list and print the stats
        logbook.record(gen=g, evals=len(pop) + ga_eval, **stats.compile(pop))
        #print(best)
        print(logbook.stream)

    print(best.fitness.values)

    # Printing to excel
    write_excel = create_excel_file('./results/pso_ga_results.xlsx')
    wb = openpyxl.load_workbook(write_excel)
    ws = wb[wb.sheetnames[-1]]

    ws.cell(1, 1).value = 'Optimal Decision Values'
    print_array_to_excel(dv, (2, 1), ws=ws, axis=1)
    print_array_to_excel(best, (3, 1), ws=ws, axis=1)

    genfit = logbook.select("gen")
    avgfit = logbook.select("avg")
    stdfit = logbook.select("std")
    minfit = logbook.select("min")
    maxfit = logbook.select("max")

    ws.cell(5, 1).value = 'gen'
    ws.cell(6, 1).value = 'avg'
    ws.cell(7, 1).value = 'std'
    ws.cell(8, 1).value = 'min'
    ws.cell(9, 1).value = 'max'

    print_array_to_excel(genfit, (5, 2), ws=ws, axis=1)
    print_array_to_excel(avgfit, (6, 2), ws=ws, axis=1)
    print_array_to_excel(stdfit, (7, 2), ws=ws, axis=1)
    print_array_to_excel(minfit, (8, 2), ws=ws, axis=1)
    print_array_to_excel(maxfit, (9, 2), ws=ws, axis=1)

    wb.save(write_excel)

    return pop, logbook, best
Example #8
0
        if np.isnan(GStitles[i]):
            state1 = 'NA'
            state2 = 'NA'
            country = 'NA'
    except:
        geocode_result = gmaps.geocode(GStitles[i])
        try:
            state1, state2, country = getstatecountry(geocode_result)
        except:
            state1 = 'Cannot retrieve'
            state2 = 'Cannot retrieve'
            country = 'Cannot retrieve'

    state1data.append(state1)
    state2data.append(state2)
    countrydata.append(country)
    #if i == 10:
    #    break

# Collate the geocoded columns into a single table and export it.
data = {'state1': state1data, 'state2': state2data, 'country': countrydata}

write_excel = create_excel_file(
    './results/{}_results.xlsx'.format('GScitystatedata'))
wb = openpyxl.load_workbook(write_excel)
ws = wb[wb.sheetnames[-1]]
geo_df = pd.DataFrame(data)
print_df_to_excel(df=geo_df, ws=ws)
wb.save(write_excel)

# Report total wall-clock runtime in hours.
runtime_s = time.time() - start
elapsed = runtime_s / 3600
print(f"Elapsed time: {elapsed} hours")
Example #9
0
    personaldata.append(personaldetails)

    print('Progress: {} out of {} for {} done'.format(counter, len(names),
                                                      name))
    counter += 1

    #if counter == 5:
    #    break

# Column headers for the per-author affiliation records scraped from IDEAS.
data_store_columns = [
    'name', 'first name', 'middle name', 'last name', 'suffix', 'repecshortID',
    'email', 'homepage', 'postal address', 'phone', 'twitterhandle',
    'affiliation', 'locationdata'
]

# Export the affiliation table to a fresh workbook under ./results/.
write_excel = create_excel_file(
    './results/{}_results.xlsx'.format('IDEASaffiliation'))
wb = openpyxl.load_workbook(write_excel)
ws = wb[wb.sheetnames[-1]]
affiliation_df = pd.DataFrame(data=personaldata, columns=data_store_columns)
print_df_to_excel(df=affiliation_df, ws=ws)
wb.save(write_excel)

# store in pkl for future retrieve (no need to rerun code again)
with open('IDEASaffiliationSecondVariant.pkl', 'wb') as handle:
    pickle.dump([data_store_columns, personaldata],
                handle,
                protocol=pickle.HIGHEST_PROTOCOL)

# Report total wall-clock runtime in hours.
runtime_s = time.time() - start
elapsed = runtime_s / 3600
print(f"Elapsed time: {elapsed} hours")
# twitterscraper from:JustinWolfers -bd 2018-07-01 -ed 2019-07-01 --output=JustinWolferspart8.json  ----error 1114
# twitterscraper from:JustinWolfers -bd 2019-07-01 -ed 2020-05-19 --output=JustinWolferspart9.json  ----error 1383
# twitterscraper from:JustinWolfers -bd 2011-07-01 -ed 2012-07-01 --output=JustinWolferspart1.json && twitterscraper from:JustinWolfers -bd 2012-07-01 -ed 2013-07-01 --output=JustinWolferspart2.json && twitterscraper from:JustinWolfers -bd 2013-07-01 -ed 2014-07-01 --output=JustinWolferspart3.json && twitterscraper from:JustinWolfers -bd 2014-07-01 -ed 2015-07-01 --output=JustinWolferspart4.json && twitterscraper from:JustinWolfers -bd 2015-07-01 -ed 2016-07-01 --output=JustinWolferspart5.json && twitterscraper from:JustinWolfers -bd 2016-07-01 -ed 2017-07-01 --output=JustinWolferspart6.json && twitterscraper from:JustinWolfers -bd 2017-07-01 -ed 2018-07-01 --output=JustinWolferspart7.json && twitterscraper from:JustinWolfers -bd 2018-07-01 -ed 2019-07-01 --output=JustinWolferspart8.json && twitterscraper from:JustinWolfers -bd 2019-07-01 -ed 2020-05-19 --output=JustinWolferspart9.json

import codecs, json
import pandas as pd
import openpyxl
from others import create_excel_file, print_df_to_excel

parts = 9

# Merge the nine per-year twitterscraper dumps into one frame.
# BUGFIX: json.load()'s `encoding` keyword was removed in Python 3.9 and
# raised TypeError on modern interpreters; codecs.open() already decodes
# the file as UTF-8, so the argument was redundant anyway.
# BUGFIX: DataFrame.append was removed in pandas 2.0; collecting the frames
# and calling pd.concat once is the supported (and non-quadratic) form.
frames = []
for partnumber in range(1, parts + 1):
    path = 'JustinWolferspart{}.json'.format(partnumber)
    # Parse with the stdlib first so a malformed dump fails loudly here.
    with codecs.open(path, 'r', 'utf-8') as f:
        tweets = json.load(f)
    dfread = pd.read_json(path, encoding='utf-8')
    print(dfread)
    frames.append(dfread)

df = pd.concat(frames)

# Write the merged tweet table to ./results/JustinWolfers_results.xlsx.
name = 'JustinWolfers'
write_excel = create_excel_file('./results/{}_results.xlsx'.format(name))
wb = openpyxl.load_workbook(write_excel)
ws = wb[wb.sheetnames[-1]]
print_df_to_excel(df=df, ws=ws)
wb.save(write_excel)