import pickle

import openpyxl
import pandas as pd

from others import create_excel_file, print_df_to_excel


def readfile(name):
    # Load the pickled [columns, rows] pair and dump it into a fresh Excel sheet.
    with open('./data_store.pkl', 'rb') as handle:
        data_store = pickle.load(handle)
    write_excel = create_excel_file('./results/{}_results.xlsx'.format(name))
    wb = openpyxl.load_workbook(write_excel)
    ws = wb[wb.sheetnames[-1]]
    print_df_to_excel(df=pd.DataFrame(data=data_store[1], columns=data_store[0]), ws=ws)
    wb.save(write_excel)
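# The helpers create_excel_file and print_df_to_excel come from the local
# `others` module (the twitterscraper script below imports them explicitly),
# but their bodies are not shown in this section. The following is a minimal
# sketch of what they are assumed to do: create a fresh workbook at a path and
# write a DataFrame into a worksheet. Names and behaviour here are assumptions,
# not the repo's actual implementation.
def create_excel_file_sketch(path):
    # New workbook with one default sheet; callers reopen it with load_workbook.
    wb = openpyxl.Workbook()
    wb.save(path)
    return path


def print_df_to_excel_sketch(df, ws, start_row=1, start_col=1):
    # Write the header row, then the values row by row.
    for j, col in enumerate(df.columns):
        ws.cell(start_row, start_col + j).value = col
    for i, row in enumerate(df.itertuples(index=False)):
        for j, value in enumerate(row):
            ws.cell(start_row + 1 + i, start_col + j).value = value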
# The opening `with` for the country checkpoint is truncated above this
# fragment; it is reconstructed here to mirror the city/state block below it.
with open('IDEAScountrydata{}.pkl'.format(indextostart), 'wb') as handle:
    pickle.dump([data_store_columns, countrydataoverall], handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('IDEAScitystatedata{}.pkl'.format(indextostart), 'wb') as handle:
    pickle.dump([data_store_columns2, citystatedataoverall], handle, protocol=pickle.HIGHEST_PROTOCOL)
print('Progress: {} out of {} done'.format(authors + indextostart, len(delimitedlocation)))
# if authors > 2:
#     break

write_excel = create_excel_file('./results/{}_results.xlsx'.format('IDEAScountrydata'))
wb = openpyxl.load_workbook(write_excel)
ws = wb[wb.sheetnames[-1]]
print_df_to_excel(df=pd.DataFrame(data=countrydataoverall, columns=data_store_columns), ws=ws)
wb.save(write_excel)

write_excel = create_excel_file('./results/{}_results.xlsx'.format('IDEAScitystatedata'))
wb = openpyxl.load_workbook(write_excel)
ws = wb[wb.sheetnames[-1]]
print_df_to_excel(df=pd.DataFrame(data=citystatedataoverall, columns=data_store_columns2), ws=ws)
wb.save(write_excel)
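# The scrapers above checkpoint progress into numbered pickles
# ('IDEAScitystatedata{indextostart}.pkl', etc.) so a crashed run can resume.
# A minimal sketch of the resume step, assuming the [columns, rows] layout the
# dumps above use; the filename pattern is taken from the code, while the
# helper name itself is hypothetical.
import glob
import pickle
import re


def load_latest_checkpoint(pattern='IDEAScitystatedata*.pkl'):
    files = glob.glob(pattern)
    if not files:
        return None, None, 0

    def start_index(fname):
        # The start index is embedded just before the .pkl extension.
        match = re.search(r'(\d+)\.pkl$', fname)
        return int(match.group(1)) if match else 0

    latest = max(files, key=start_index)
    with open(latest, 'rb') as handle:
        columns, rows = pickle.load(handle)
    return columns, rows, start_index(latest)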
title = 'NA'
email = 'NA'
personaldetails.append(name)
personaldetails.append(title)
personaldetails.append(email)
personaldata.append(personaldetails)
with open('GSaffiliationscrap{}.pkl'.format(indextostart), 'wb') as handle:
    pickle.dump([data_store_columns, personaldata], handle, protocol=pickle.HIGHEST_PROTOCOL)
print('Progress: {} out of {} for {} done'.format(authors + indextostart, numberofauthors, name))
# if authors > 3:
#     break

write_excel = create_excel_file('./results/{}_results.xlsx'.format('GSAffiliationScrap'))
wb = openpyxl.load_workbook(write_excel)
ws = wb[wb.sheetnames[-1]]
print_df_to_excel(df=pd.DataFrame(data=personaldata, columns=data_store_columns), ws=ws)
wb.save(write_excel)

elapsed = (time.time() - start) / 3600
print(f"Elapsed time: {elapsed} hours")
title = affiliationtable.text
# print(title)
# 'gsc_prf_ivh' is the Scholar profile div holding the "Verified email at ..." line.
verifiedemail = soup.find('div', attrs={'id': 'gsc_prf_ivh'})
email = verifiedemail.text
personaldetails.append(title)
personaldetails.append(email)
personaldata.append(personaldetails)
with open('GSauthorduplicate{}.pkl'.format(indextostart), 'wb') as handle:
    pickle.dump([data_store_columns, personaldata], handle, protocol=pickle.HIGHEST_PROTOCOL)
print('Progress: {} out of {} for {} done'.format(authors + indextostart, numberofauthors, name))
# if authors > 3:
#     break

write_excel = create_excel_file('./results/{}_results.xlsx'.format('GSauthorduplicate'))
wb = openpyxl.load_workbook(write_excel)
ws = wb[wb.sheetnames[-1]]
print_df_to_excel(df=pd.DataFrame(data=personaldata, columns=data_store_columns), ws=ws)
wb.save(write_excel)

elapsed = (time.time() - start) / 3600
print(f"Elapsed time: {elapsed} hours")
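# The `soup` object above is a parsed Google Scholar profile page, but the
# fetching code is not part of this fragment. Here is a minimal sketch of how
# such a soup could be built with requests + BeautifulSoup; the URL pattern is
# Scholar's public profile route, while the helper name, headers, and timeout
# are illustrative assumptions. Scholar aggressively rate-limits scrapers.
import requests
from bs4 import BeautifulSoup


def fetch_profile_soup(user_id):
    url = 'https://scholar.google.com/citations?user={}'.format(user_id)
    resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html.parser')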
# DataFrame.append is deprecated (removed in pandas 2.0); concatenate the
# per-run chunks in one call instead.
personaldata = pd.concat([personaldata, df7a, df8, df9, df10, df11])

data_store_columns = ['Name', 'Title', 'Email']
# Column sets used by earlier variants of this scrape, kept for reference:
# data_store_columns = ['Name', 'Total Citations', 'Total Citations (5 years)', 'h-index',
#                       'h-index (5 years)', 'i10-index', 'i10-index (5 years)', 'Journal Authors',
#                       'Journal Titles', 'Journal Names', 'Journal Years', 'Number of publications',
#                       'Probability of correct profile']
# data_store_columns = ['Name', 'Total Citations', 'Total Citations (5 years)', 'h-index',
#                       'h-index (5 years)', 'i10-index', 'i10-index (5 years)', 'Journal Authors',
#                       'Journal Titles', 'Journal Names', 'Journal Years', 'Number of publications',
#                       'Probability of correct profile', 'Keywords match', 'Title', 'Email']

with open('GSaffiliationscrapComplete.pkl', 'wb') as handle:
    pickle.dump([data_store_columns, personaldata.values], handle, protocol=pickle.HIGHEST_PROTOCOL)

write_excel = create_excel_file('./results/{}_results.xlsx'.format('GSaffiliationscrapComplete'))
wb = openpyxl.load_workbook(write_excel)
ws = wb[wb.sheetnames[-1]]
print_df_to_excel(df=pd.DataFrame(data=personaldata, columns=data_store_columns), ws=ws)
wb.save(write_excel)
personaldata.append(personaldetails)
with open('GSdatascrap{}.pkl'.format(indextostart), 'wb') as handle:
    pickle.dump([data_store_columns, personaldata], handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('authorduplicate{}.pkl'.format(indextostart), 'wb') as handle:
    pickle.dump([['Authors with more than 1 profile in GS'], authorsduplicate], handle,
                protocol=pickle.HIGHEST_PROTOCOL)
print('Progress: {} out of {} for {} done'.format(authors + indextostart, numberofauthors, name))
# if authors > 3:
#     break

write_excel = create_excel_file('./results/{}_results.xlsx'.format('GSWebScrap'))
wb = openpyxl.load_workbook(write_excel)
ws = wb[wb.sheetnames[-1]]
print_df_to_excel(df=pd.DataFrame(data=personaldata, columns=data_store_columns), ws=ws)
wb.save(write_excel)

elapsed = (time.time() - start) / 3600
print(f"Elapsed time: {elapsed} hours")
def pso_ga(func, pmin, pmax, smin, smax, int_idx, params, ga, dv):
    # Unpack hyperparameters. Note: the dict keys 'cxpd' and 'mutpd' are kept
    # as spelled here, since that is what callers of this function must supply.
    c1, c2, wmin, wmax, ga_iter_min, ga_iter_max, iter_gamma, ga_num_min, ga_num_max, num_beta,\
        tourn_size, cxpb, mutpb, indpd, eta,\
        pso_iter, swarm_size = \
        params['c1'], params['c2'], params['wmin'], params['wmax'],\
        params['ga_iter_min'], params['ga_iter_max'], params['iter_gamma'],\
        params['ga_num_min'], params['ga_num_max'], params['num_beta'],\
        params['tourn_size'], params['cxpd'], params['mutpd'], params['indpd'], params['eta'],\
        params['pso_iter'], params['swarm_size']

    # int_idx must be a list. If a single number is given, convert it to a list.
    if isinstance(int_idx, int):
        int_idx = [int_idx]

    creator.create("FitnessMin", base.Fitness, weights=(-1.0,))  # minimisation of a single scalar
    creator.create("Particle", list, fitness=creator.FitnessMin, speed=list,
                   smin=None, smax=None, best=None, int_idx=None)

    toolbox = base.Toolbox()
    toolbox.register("particle", generate_part, dim=len(pmin), pmin=pmin, pmax=pmax,
                     smin=smin, smax=smax, int_idx=int_idx)
    toolbox.register("population", tools.initRepeat, list, toolbox.particle)
    toolbox.register("update", updateParticle, c1=c1, c2=c2)
    toolbox.register("evaluate", func)
    toolbox.register("mate", tools.cxTwoPoint)
    # toolbox.register("mutate", ga_hybrid_polymutate, low=pmin, up=pmax, indpb=indpd, eta=eta)
    toolbox.register("mutate", ga_hybrid_gaussianmutate, low=pmin, up=pmax, indpb=indpd, sigma=smax)

    pop = toolbox.population(n=swarm_size)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)

    logbook = tools.Logbook()
    logbook.header = ["gen", "evals"] + stats.fields

    best = None
    pso_hof_num = max(1, round(ga_num_min * 0.2))
    pso_hof = tools.HallOfFame(pso_hof_num)

    for g in range(pso_iter):
        # PSO segment first
        for part in pop:
            part.fitness.values = toolbox.evaluate(part)
            # Note: fitness comparisons compare the weighted value. Since the weight is
            # negative, the comparison is inverted unless .values is used directly.
            if not part.best or part.best.fitness.values[0] > part.fitness.values[0]:
                part.best = creator.Particle(part)
                part.best.fitness.values = part.fitness.values
            if not best or best.fitness.values[0] > part.fitness.values[0]:
                best = creator.Particle(part)
                best.fitness.values = part.fitness.values
        for part in pop:
            # Linear annealing of the inertia velocity coefficient (the w weight)
            toolbox.update(part, best=best, w=wmax - (wmax - wmin) * (g / pso_iter) ** iter_gamma)

        if ga:
            # GA segment. The GA sub-population size and generation count grow from
            # their minimum toward their maximum over the PSO run. (The original
            # mixed the num/iter bounds and exponents; they are paired up here.)
            ga_pop = round(ga_num_min + (g / pso_iter) ** num_beta * (ga_num_max - ga_num_min))
            ga_gen = round(ga_iter_min + (g / pso_iter) ** iter_gamma * (ga_iter_max - ga_iter_min))
            if len(pso_hof) == 0:
                ga_mask = [1 for _ in range(ga_pop)] + [0 for _ in range(swarm_size - ga_pop)]
                random.shuffle(ga_mask)
                population = [x for x, mask in zip(pop, ga_mask) if mask == 1]
            else:
                ga_pop -= pso_hof_num
                ga_mask = [1 for _ in range(ga_pop)] + [0 for _ in range(swarm_size - ga_pop)]
                random.shuffle(ga_mask)
                population = [x for x, mask in zip(pop, ga_mask) if mask == 1] + pso_hof.items
            halloffame = tools.HallOfFame(ga_pop)
            halloffame.update(population)
            ga_eval = 0
            # Begin the generational process
            for gen in range(ga_gen):
                # Select the next generation. The built-in tournament selector does not
                # work for multi-objective fitness:
                # offspring = toolbox.select(population, len(population))
                # The manual tournament selection below works for multi-objective too.
                chosen = []
                for i in range(ga_pop):
                    aspirants = selRandom(population, tourn_size)
                    scores = [x.fitness.values[0] for x in aspirants]
                    # Pick the aspirant with the lowest raw fitness (minimisation).
                    chosen_idx = min(range(len(scores)), key=lambda j: scores[j])
                    chosen.append(aspirants[chosen_idx])
                offspring = chosen
                # Vary the pool of individuals
                offspring = varAnd(offspring, toolbox, cxpb, mutpb)
                # Evaluate the individuals with an invalid fitness
                invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
                ga_eval += len(invalid_ind)
                fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
                for ind, fit in zip(invalid_ind, fitnesses):
                    ind.fitness.values = fit
                # Update the hall of fame with the generated individuals
                halloffame.update(offspring)
                # Replace the current population with the offspring
                population[:] = offspring
            counter = 0
            if best.fitness.values[0] > halloffame[0].fitness.values[0]:
                best = creator.Particle(halloffame[0])
                best.fitness.values = halloffame[0].fitness.values
            # Inject improved hall-of-fame members back into the masked swarm slots.
            for idx, mask in enumerate(ga_mask):
                if mask == 1:
                    try:
                        if pop[idx].fitness.values[0] > halloffame[counter].fitness.values[0]:
                            pop[idx] = halloffame[counter]
                            # Bug fix: the original copied `part`, a stale loop variable;
                            # the intended source is the hall-of-fame member itself.
                            pop[idx].best = creator.Particle(halloffame[counter])
                            pop[idx].best.fitness.values = halloffame[counter].fitness.values
                        counter += 1
                    except IndexError:
                        break
            pso_hof.update(pop)

        # Gather all the fitnesses in one list and print the stats
        try:
            logbook.record(gen=g, evals=len(pop) + ga_eval, **stats.compile(pop))
        except UnboundLocalError:
            # ga=False, so ga_eval was never assigned
            logbook.record(gen=g, evals=len(pop), **stats.compile(pop))
        # print(best)
        print(logbook.stream)
        print(best.fitness.values)

    # Print results to Excel
    write_excel = create_excel_file('./results/pso_ga_results.xlsx')
    wb = openpyxl.load_workbook(write_excel)
    ws = wb[wb.sheetnames[-1]]
    ws.cell(1, 1).value = 'Optimal Decision Values'
    print_array_to_excel(dv, (2, 1), ws=ws, axis=1)
    print_array_to_excel(best, (3, 1), ws=ws, axis=1)
    genfit = logbook.select("gen")
    avgfit = logbook.select("avg")
    stdfit = logbook.select("std")
    minfit = logbook.select("min")
    maxfit = logbook.select("max")
    ws.cell(5, 1).value = 'gen'
    ws.cell(6, 1).value = 'avg'
    ws.cell(7, 1).value = 'std'
    ws.cell(8, 1).value = 'min'
    ws.cell(9, 1).value = 'max'
    print_array_to_excel(genfit, (5, 2), ws=ws, axis=1)
    print_array_to_excel(avgfit, (6, 2), ws=ws, axis=1)
    print_array_to_excel(stdfit, (7, 2), ws=ws, axis=1)
    print_array_to_excel(minfit, (8, 2), ws=ws, axis=1)
    print_array_to_excel(maxfit, (9, 2), ws=ws, axis=1)
    wb.save(write_excel)
    return pop, logbook, best
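# A minimal usage sketch for pso_ga on a toy sphere objective. The parameter
# dict mirrors the keys unpacked at the top of pso_ga (including the 'cxpd'
# and 'mutpd' spellings the function expects); every value here is an
# illustrative guess, not a tuned setting. The call itself is left commented
# out because generate_part and updateParticle are defined elsewhere in the repo.
def sphere(ind):
    # DEAP fitnesses must be iterables, hence the one-element tuple.
    return (sum(x ** 2 for x in ind),)


demo_params = {
    'c1': 1.5, 'c2': 1.5, 'wmin': 0.4, 'wmax': 0.9,
    'ga_iter_min': 2, 'ga_iter_max': 10, 'iter_gamma': 1.0,
    'ga_num_min': 5, 'ga_num_max': 20, 'num_beta': 2.0,
    'tourn_size': 3, 'cxpd': 0.5, 'mutpd': 0.2, 'indpd': 0.5, 'eta': 0.5,
    'pso_iter': 50, 'swarm_size': 30,
}

# pop, logbook, best = pso_ga(func=sphere, pmin=[-5, -5], pmax=[5, 5],
#                             smin=[-1, -1], smax=[1, 1], int_idx=[],
#                             params=demo_params, ga=True, dv=['x1', 'x2'])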
# This fragment runs inside a loop over the scraped affiliation strings; the
# loop header and opening `try` are truncated above and reconstructed here
# from the indexing and the commented-out break below.
for i in range(len(GStitles)):
    try:
        if np.isnan(GStitles[i]):  # missing affiliations are stored as NaN floats
            state1 = 'NA'
            state2 = 'NA'
            country = 'NA'
    except TypeError:
        # np.isnan raises TypeError on strings, so real affiliations land here
        # and get geocoded.
        geocode_result = gmaps.geocode(GStitles[i])
        try:
            state1, state2, country = getstatecountry(geocode_result)
        except Exception:
            state1 = 'Cannot retrieve'
            state2 = 'Cannot retrieve'
            country = 'Cannot retrieve'
    state1data.append(state1)
    state2data.append(state2)
    countrydata.append(country)
    # if i == 10:
    #     break

data = {'state1': state1data, 'state2': state2data, 'country': countrydata}
write_excel = create_excel_file('./results/{}_results.xlsx'.format('GScitystatedata'))
wb = openpyxl.load_workbook(write_excel)
ws = wb[wb.sheetnames[-1]]
print_df_to_excel(df=pd.DataFrame(data), ws=ws)
wb.save(write_excel)

elapsed = (time.time() - start) / 3600
print(f"Elapsed time: {elapsed} hours")
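# getstatecountry is not defined in this fragment. A minimal sketch of what it
# is assumed to do with the Google Maps Geocoding response: walk the
# address_components of the first result and pull out the two administrative
# levels plus the country. The field names follow the documented Geocoding API
# response format; the (state1, state2, country) ordering matches the call
# above, and the caller's try/except covers empty result lists.
def getstatecountry_sketch(geocode_result):
    state1 = state2 = country = 'NA'
    for component in geocode_result[0]['address_components']:
        types = component['types']
        if 'administrative_area_level_1' in types:
            state1 = component['long_name']
        elif 'administrative_area_level_2' in types:
            state2 = component['long_name']
        elif 'country' in types:
            country = component['long_name']
    return state1, state2, country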
personaldata.append(personaldetails)
print('Progress: {} out of {} for {} done'.format(counter, len(names), name))
counter += 1
# if counter == 5:
#     break

data_store_columns = [
    'name', 'first name', 'middle name', 'last name', 'suffix', 'repecshortID',
    'email', 'homepage', 'postal address', 'phone', 'twitterhandle',
    'affiliation', 'locationdata'
]
write_excel = create_excel_file('./results/{}_results.xlsx'.format('IDEASaffiliation'))
wb = openpyxl.load_workbook(write_excel)
ws = wb[wb.sheetnames[-1]]
print_df_to_excel(df=pd.DataFrame(data=personaldata, columns=data_store_columns), ws=ws)
wb.save(write_excel)

# Store in pkl for future retrieval (no need to rerun the scrape)
with open('IDEASaffiliationSecondVariant.pkl', 'wb') as handle:
    pickle.dump([data_store_columns, personaldata], handle, protocol=pickle.HIGHEST_PROTOCOL)

elapsed = (time.time() - start) / 3600
print(f"Elapsed time: {elapsed} hours")
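# Reloading the pickle written above, so later analysis scripts can skip the
# scrape entirely. A minimal sketch; the [columns, rows] layout matches the
# dump above, and the variable names here are illustrative.
with open('IDEASaffiliationSecondVariant.pkl', 'rb') as handle:
    stored_columns, stored_rows = pickle.load(handle)
affiliations_df = pd.DataFrame(data=stored_rows, columns=stored_columns)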
# Shell commands used to scrape the tweets in yearly chunks via the
# twitterscraper CLI. The first two runs failed partway (error counts noted);
# the chained command below was one single && pipeline, split here for readability:
# twitterscraper from:JustinWolfers -bd 2018-07-01 -ed 2019-07-01 --output=JustinWolferspart8.json  ---- error 1114
# twitterscraper from:JustinWolfers -bd 2019-07-01 -ed 2020-05-19 --output=JustinWolferspart9.json  ---- error 1383
# twitterscraper from:JustinWolfers -bd 2011-07-01 -ed 2012-07-01 --output=JustinWolferspart1.json &&
# twitterscraper from:JustinWolfers -bd 2012-07-01 -ed 2013-07-01 --output=JustinWolferspart2.json &&
# twitterscraper from:JustinWolfers -bd 2013-07-01 -ed 2014-07-01 --output=JustinWolferspart3.json &&
# twitterscraper from:JustinWolfers -bd 2014-07-01 -ed 2015-07-01 --output=JustinWolferspart4.json &&
# twitterscraper from:JustinWolfers -bd 2015-07-01 -ed 2016-07-01 --output=JustinWolferspart5.json &&
# twitterscraper from:JustinWolfers -bd 2016-07-01 -ed 2017-07-01 --output=JustinWolferspart6.json &&
# twitterscraper from:JustinWolfers -bd 2017-07-01 -ed 2018-07-01 --output=JustinWolferspart7.json &&
# twitterscraper from:JustinWolfers -bd 2018-07-01 -ed 2019-07-01 --output=JustinWolferspart8.json &&
# twitterscraper from:JustinWolfers -bd 2019-07-01 -ed 2020-05-19 --output=JustinWolferspart9.json

import codecs
import json

import openpyxl
import pandas as pd

from others import create_excel_file, print_df_to_excel

parts = 9
# json.load no longer accepts an encoding argument (removed in Python 3.9);
# the file is opened as utf-8 via codecs instead. `tweets` is kept from the
# original even though pd.read_json re-reads the same file.
with codecs.open('JustinWolferspart{}.json'.format(1), 'r', 'utf-8') as f:
    tweets = json.load(f)
df = pd.read_json('JustinWolferspart{}.json'.format(1), encoding='utf-8')
print(df)
for partnumber in range(2, parts + 1):
    with codecs.open('JustinWolferspart{}.json'.format(partnumber), 'r', 'utf-8') as f:
        tweets = json.load(f)
    dfread = pd.read_json('JustinWolferspart{}.json'.format(partnumber), encoding='utf-8')
    print(dfread)
    # DataFrame.append is deprecated (removed in pandas 2.0); use concat instead.
    df = pd.concat([df, dfread])

name = 'JustinWolfers'
write_excel = create_excel_file('./results/{}_results.xlsx'.format(name))
wb = openpyxl.load_workbook(write_excel)
ws = wb[wb.sheetnames[-1]]
print_df_to_excel(df=df, ws=ws)
wb.save(write_excel)
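# The yearly chunks share boundary dates (e.g. 2018-07-01 ends part 7 and
# starts part 8), so the merged frame may contain duplicate tweets. A minimal
# dedupe sketch; it assumes the twitterscraper dumps carry 'tweet_id' and
# 'timestamp' columns, so adjust the names to whatever the JSON actually holds.
df = df.drop_duplicates(subset='tweet_id').sort_values('timestamp')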