import json
from os import makedirs
from os.path import exists, join

# get_files is a project helper assumed to be imported from a utilities module in
# this repo (a sketch of it follows after this script).


def main():
    print('Iniciando...')

    with open('../config.json') as f:
        config = json.load(f)

    publons_data_path = config['publons_data']
    if not exists(publons_data_path):
        makedirs(publons_data_path)

    # Look for previously recovered data
    data_recovered_files = get_files(publons_data_path, 'json')
    data_recovered = []
    if data_recovered_files:
        for data_recovered_file in data_recovered_files:
            usp_id = data_recovered_file.split('_')[0]
            data_recovered.append(usp_id)
    print('Qtd. de dados recuperados previamente: ', len(data_recovered))

    # Write the file listing the recovered data
    data_analysed_file = join(publons_data_path, 'data_analysed.txt')
    with open(data_analysed_file, 'w', newline='') as f:
        f.write(','.join(set(data_recovered)))

    # Clear the missing-data file
    data_missing_file = join(publons_data_path, 'data_missing.txt')
    with open(data_missing_file, 'w', newline='') as f:
        f.write('')

    print('Qtd. de dados no novo arquivo: ', len(data_recovered))

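# Note: get_files is not defined in these listings. The sketch below is a minimal
# assumption of what it does (return the names of the files in `path` that have the
# given extension); the repo's actual helper may differ.
from os import listdir
from os.path import isfile, join


def get_files(path, extension):
    """Return the file names in `path` ending with the given extension."""
    return [
        f for f in listdir(path)
        if isfile(join(path, f)) and f.endswith('.' + extension)
    ]
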
import json
from os import makedirs
from os.path import exists

# get_files and get_depts are project helpers assumed to be imported from a
# utilities/scraping module in this repo; get_depts(units, path) is expected to
# fetch the department data for the given unit codes and return the codes that
# are still missing.


def main():
    # USP unit codes to collect
    all_units = [
        30, 64, 86, 27, 39, 90, 7, 22, 88, 18, 3, 11, 16, 9, 60, 2, 89, 12,
        81, 48, 59, 8, 5, 17, 10, 23, 25, 58, 6, 74, 93, 14, 41, 42, 55, 4,
        31, 43, 76, 44, 45, 83, 47, 46, 75, 87, 21, 71, 32, 38, 33, 1
    ]
    all_units.sort()

    with open('../config.json') as f:
        config = json.load(f)

    if not exists(config['depts']):
        makedirs(config['depts'])

    # Look for previously recovered departments
    files = get_files(config['depts'], 'json')
    saved_units = []
    if len(files) > 0:
        for file in files:
            # File names end with '<unit>.json'; strip the extension and
            # compare as int
            unit = int(file.split('_').pop()[:-5])
            if unit not in saved_units:
                saved_units.append(unit)
        saved_units.sort()
    else:
        print('Nenhum departamento recuperado previamente.')

    print('Iniciando...')
    units = list(set(all_units) - set(saved_units))
    # Keep retrying until get_depts reports no missing units
    while len(units) > 0:
        units = get_depts(units, config['depts'])
    else:
        print('Dados dos departamentos recuperados.')

import csv
import json
import time
from os import makedirs
from os.path import exists, join

import requests
from bs4 import BeautifulSoup

# get_files and remove_accent_mark are project helpers assumed to be imported from a
# utilities module in this repo (get_files is sketched after the first script; a
# sketch of remove_accent_mark follows this one).


def main():
    print('Iniciando...')
    wait_time = [3, 5, 7, 9]
    s = requests.Session()

    with open('../config.json') as f:
        config = json.load(f)

    if not exists(config['people']):
        print('Nenhum dado a ser recuperado.')
        return 0

    files = get_files(config['people'], 'csv')
    if len(files) == 0:
        print('Nenhum dado a ser recuperado.')
        return 0

    if not exists(config['publons_info']):
        makedirs(config['publons_info'])

    # Look for previously recovered departments
    depts_recovered_file = join(config['publons_info'], 'depts_recovered.txt')
    if exists(depts_recovered_file):
        with open(depts_recovered_file, 'r') as f:
            depts = f.read().split(',')
        print('Departamentos recuperados previamente: ', len(depts))
    else:
        depts = []

    publons_file = join(config['publons_info'], 'publons_info.csv')
    if not exists(publons_file):
        print('Novo arquivo criado.')
        with open(publons_file, 'w', newline='') as f:
            csv_writer = csv.writer(f, quoting=csv.QUOTE_NONE, escapechar='\\')
            csv_writer.writerow([
                'usp_id', 'usp_name', 'usp_unit', 'usp_dept', 'publons_id',
                'publons_name'
            ])

    i = 0
    for file in files:
        # File names follow the pattern '<unit>_<dept>.csv'
        file_name = file.split('_')
        unit = file_name[0]
        dept = file_name[1][:-4]
        unit_dept = unit + '_' + dept
        researchers_info = []

        if unit_dept in depts:
            print('Departamento recuperado previamente: ', unit_dept, flush=True)
            continue
        else:
            print('Recuperando dados do departamento: ', unit_dept, flush=True)

        with open(join(config['people'], file), 'r', newline='',
                  encoding='utf-8') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            next(csv_reader)  # skip header
            for row in csv_reader:
                # Column 5 of the row holds the Web of Science URL, which is used
                # to obtain the researcher's Publons API id
                url_wos = row[5]
                # Get the USP id, used to rebuild the URL
                usp_id = row[7]
                usp_name = remove_accent_mark(row[9].lower().replace(' ', '-'))
                if url_wos != '':
                    # Cycle through the wait times to pace the requests
                    if i == 4:
                        i = 0
                    time.sleep(wait_time[i])
                    i = i + 1
                    try:
                        content = s.get(url_wos).text
                    except requests.exceptions.ConnectionError:
                        time.sleep(60)
                        content = s.get(url_wos).text
                    page = BeautifulSoup(content, 'lxml')
                    og_url = page.find('meta', attrs={'property': 'og:url'})
                    if og_url is not None:
                        publons_id = og_url['content'].split('/')[4]
                        publons_name = og_url['content'].split('/')[5]
                        researchers_info.append({
                            'usp_id': usp_id,
                            'usp_name': usp_name,
                            'usp_unit': unit,
                            'usp_dept': dept,
                            'publons_id': publons_id,
                            'publons_name': publons_name
                        })
                    else:
                        researchers_info.append({
                            'usp_id': usp_id,
                            'usp_name': usp_name,
                            'usp_unit': unit,
                            'usp_dept': dept,
                            'publons_id': 'missing_id',
                            'publons_name': ''
                        })
                else:
                    researchers_info.append({
                        'usp_id': usp_id,
                        'usp_name': usp_name,
                        'usp_unit': unit,
                        'usp_dept': dept,
                        'publons_id': 'missing_id',
                        'publons_name': ''
                    })

        if researchers_info:
            with open(publons_file, 'a', newline='') as f:
                csv_writer = csv.writer(f, quoting=csv.QUOTE_NONE, escapechar='\\')
                for researcher_info in researchers_info:
                    csv_writer.writerow([
                        researcher_info['usp_id'], researcher_info['usp_name'],
                        researcher_info['usp_unit'], researcher_info['usp_dept'],
                        researcher_info['publons_id'],
                        researcher_info['publons_name']
                    ])
            print('Dados recuperados do departamento: ', unit_dept, flush=True)
            depts.append(unit_dept)
            with open(depts_recovered_file, 'w', newline='') as f:
                print('Escrevendo arquivo com departamento recuperado...')
                f.write(','.join(depts))

    print('Departamentos recuperados: ', len(depts))
    print('Fim')

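# Note: remove_accent_mark is not defined in this listing. A minimal sketch, assuming
# it strips diacritics via Unicode normalization; the repo's actual helper may differ.
import unicodedata


def remove_accent_mark(text):
    """Return `text` with combining accent marks removed."""
    normalized = unicodedata.normalize('NFKD', text)
    return ''.join(c for c in normalized if not unicodedata.combining(c))
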
import csv
import json
import time
from os import makedirs
from os.path import exists, join

import requests

# get_files is a project helper assumed to be imported from a utilities module in
# this repo (see the sketch after the first script).


def main():
    print('Iniciando...')
    wait_time = [3, 5, 7, 9]
    i = 0
    s = requests.Session()

    with open('../config.json') as f:
        config = json.load(f)

    publons_info_path = config['publons_info']
    if not exists(publons_info_path):
        print('Nenhum dado a ser recuperado.')
        return 0

    publons_file = join(publons_info_path, 'publons_info_unique_filtered.csv')
    if not exists(publons_file):
        print('Nenhum dado a ser recuperado.')
        return 0

    publons_data_path = config['publons_data']
    if not exists(publons_data_path):
        makedirs(publons_data_path)

    # Look for previously analysed data
    data_analysed_file = join(publons_data_path, 'data_analysed.txt')
    if exists(data_analysed_file):
        with open(data_analysed_file, 'r') as f:
            data_analysed = f.read().split(',')
    else:
        data_analysed = []
    print('Qtd. de dados analisados previamente: ', len(data_analysed))

    # Look for previously recovered data
    data_recovered_files = get_files(publons_data_path, 'json')
    data_recovered = []
    if data_recovered_files:
        for data_recovered_file in data_recovered_files:
            usp_id = data_recovered_file.split('_')[0]
            data_recovered.append(usp_id)
    print('Qtd. de dados recuperados previamente: ', len(data_recovered))

    # Look for data previously analysed but flagged as problematic
    data_missing_file = join(publons_data_path, 'data_missing.txt')
    if exists(data_missing_file):
        with open(data_missing_file, 'r') as f:
            data_missing = f.read().split(',')
            # A file containing only the empty string means no problematic ids yet
            if data_missing == ['']:
                data_missing = []
    else:
        data_missing = []
    print('Qtd. de dados com problema previamente: ', len(data_missing))

    count = 0
    with open(publons_file, 'r', newline='') as f:
        csv_reader = csv.reader(f, delimiter=',')
        next(csv_reader)  # skip header
        for row in csv_reader:
            # Stop after 2000 requests per run (daily limit)
            if count >= 2000:
                print('Limite atingido. Por favor, execute esse script novamente em 24 horas.')
                break
            usp_id = row[0]
            publons_id = row[4]
            if usp_id in data_analysed:
                continue
            print('Recuperando dados de: ', usp_id + '_' + publons_id, flush=True)
            # Cycle through the wait times to pace the requests
            if i == 4:
                i = 0
            time.sleep(wait_time[i])
            i = i + 1
            url = ('https://publons.com/researcher/api/' + publons_id +
                   '/metrics/individualStats/')
            try:
                response = s.get(url)
            except requests.exceptions.ConnectionError:
                time.sleep(60)
                response = s.get(url)
            count = count + 1
            if response:
                r = response.json()
                if 'ready' in r and len(r.keys()) == 1:
                    # The API answered but has no stats for this researcher yet
                    print('Sem dados.')
                    if usp_id not in data_missing:
                        data_missing.append(usp_id)
                        with open(data_missing_file, 'w', newline='') as missing_file:
                            print('Escrevendo arquivo sem resposta...')
                            missing_file.write(','.join(data_missing))
                else:
                    with open(join(publons_data_path,
                                   usp_id + '_' + publons_id + '.json'),
                              'w', encoding='utf-8') as json_file:
                        json.dump(r, json_file, ensure_ascii=False, indent=4)
                    print('Dado recuperado com sucesso.')
            else:
                print('Sem dados.')
                if usp_id not in data_missing:
                    data_missing.append(usp_id)
                    with open(data_missing_file, 'w', newline='') as missing_file:
                        print('Escrevendo arquivo sem resposta...')
                        missing_file.write(','.join(data_missing))
            if usp_id not in data_analysed:
                data_analysed.append(usp_id)
                # Record the analysed id
                with open(data_analysed_file, 'w', newline='') as analysed_file:
                    analysed_file.write(','.join(data_analysed))

    print('------')
    print('Requisitados: ', count)
    print('Qtd. de dados analisados: ', len(data_analysed))
    print('Qtd. de dados recuperados: ', len(data_analysed) - len(data_missing))
    print('Sem dados: ', len(data_missing))
    print('Fim')

import csv
import json
from os import makedirs
from os.path import exists, join

# get_files is a project helper assumed to be imported from a utilities module in
# this repo (see the sketch after the first script).


def main():
    print('Iniciando...')

    with open('../config.json') as f:
        config = json.load(f)

    # Check whether the data is available
    publons_data_path = config['publons_data']
    if not exists(publons_data_path):
        print('Nenhum dado a ser processado.')
        return 0

    files = get_files(publons_data_path, 'json')
    if len(files) == 0:
        print('Nenhum dado a ser processado.')
        return 0

    # Check whether the results path has been created
    publons_results_path = config['publons_results']
    if not exists(publons_results_path):
        makedirs(publons_results_path)

    # Check data recovered previously
    publons_result_file = join(publons_results_path, 'results_publons.csv')
    heading = [
        'usp_id', 'wos_publications', 'citations', 'citations_per_item',
        'citations_per_year', 'h_index'
    ]
    data_analysed = set()
    if not exists(publons_result_file):
        with open(publons_result_file, 'w', newline='') as f:
            csv_writer = csv.writer(f, quoting=csv.QUOTE_NONE, escapechar='\\')
            csv_writer.writerow(heading)
    else:
        with open(publons_result_file, 'r', newline='') as f:
            csv_reader = csv.reader(f, delimiter=',')
            next(csv_reader)  # skip header
            for row in csv_reader:
                data_analysed.add(','.join(row))
        print('Qtd. de dados recuperados previamente: ', len(data_analysed))

    print('Processando dados')
    for file in files:
        publons_file = join(publons_data_path, file)
        usp_id = file.split('_')[0]

        # Read the Publons file, skipping empty responses
        with open(publons_file, 'r', encoding='utf-8') as f:
            start = f.read(2)
            if start != '[]' and start != '':
                f.seek(0)
                data = json.load(f)
            else:
                print('Sem dados de usp_id: ', usp_id, flush=True)
                continue

        # Fill in any missing metrics with empty values
        if 'numPublicationsInWos' not in data:
            data['numPublicationsInWos'] = ''
        if 'timesCited' not in data:
            data['timesCited'] = ''
        if 'hIndex' not in data:
            data['hIndex'] = ''
        if 'averagePerItem' not in data:
            data['averagePerItem'] = ''
        if 'averagePerYear' not in data:
            data['averagePerYear'] = ''

        information = ','.join([
            usp_id,
            str(data['numPublicationsInWos']),
            str(data['timesCited']),
            str(data['averagePerItem']),
            str(data['averagePerYear']),
            str(data['hIndex'])
        ])

        # Append only rows that are not already in the results file
        if information not in data_analysed:
            with open(publons_result_file, 'a', newline='') as f:
                csv_writer = csv.writer(f, quoting=csv.QUOTE_NONE, escapechar='\\')
                csv_writer.writerow([
                    usp_id, data['numPublicationsInWos'], data['timesCited'],
                    data['averagePerItem'], data['averagePerYear'],
                    data['hIndex']
                ])

    print('Nro de ids processados: ', len(files))

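# All of the scripts above read '../config.json'. Based on the keys they access, the
# file is expected to map these names to directory paths (the example values below
# are placeholders, not the repo's actual layout):
#
#   {
#       "depts": "../data/depts",
#       "people": "../data/people",
#       "publons_info": "../data/publons_info",
#       "publons_data": "../data/publons_data",
#       "publons_results": "../data/publons_results"
#   }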