def cleanPublicationCSV(filename):
    # Deduplicate DOIs, keeping the earliest date for each one.
    publications = {}
    with open(filename, encoding='utf-8') as document:
        reader = csv.reader(document, delimiter=",")
        next(reader)  # skip the header row
        for row in reader:
            if row[0] not in publications:
                publications[row[0]] = row[1]
            elif row[1] != publications[row[0]]:
                # Conflicting dates for the same DOI: keep the earlier one
                # (string comparison, which works for ISO-style dates).
                publications[row[0]] = min(row[1], publications[row[0]])
    if not os.path.exists('./data/tmp'):
        os.makedirs('./data/tmp')
    shutil.copyfile(filename, './data/tmp/BACKUP_PUBLICATION_DATES.csv')
    os.remove(filename)
    try:
        # Patch to add a header to the publication_dates file.
        asn.createCSV({}, filename, ["doi", "date"], 0)
        asn.createPublicationDatesCSV(publications, filename)
    except Exception:
        log = open('./data/tmp/log.txt', 'a')
        log.write('Error while refactoring PUBLICATION_DATES.csv\n')
        log.close()
        # Restore the original file from the backup.
        shutil.copyfile('./data/tmp/BACKUP_PUBLICATION_DATES.csv', filename)
    finally:
        # Remove the backup exactly once; the original code also deleted it in
        # the except branch, so this line raised FileNotFoundError after a failure.
        os.remove('./data/tmp/BACKUP_PUBLICATION_DATES.csv')
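
# cleanPublicationCSV and cleanCandidatesCSV share the same backup /
# restore-on-failure pattern. A minimal sketch of how it could be factored
# into a reusable context manager -- illustrative only: the name
# csv_rewrite_backup and its signature are assumptions, not part of this
# project's code (os and shutil are already imported by this module).
from contextlib import contextmanager

@contextmanager
def csv_rewrite_backup(filename, backup_path):
    """Back up `filename`, yield for the rewrite, restore the backup on failure."""
    os.makedirs(os.path.dirname(backup_path), exist_ok=True)
    shutil.copyfile(filename, backup_path)
    os.remove(filename)
    try:
        yield
    except Exception:
        shutil.copyfile(backup_path, filename)  # put the original file back
        raise
    finally:
        os.remove(backup_path)  # the backup is deleted exactly once

# Hypothetical usage:
#     with csv_rewrite_backup(filename, './data/tmp/BACKUP_PUBLICATION_DATES.csv'):
#         asn.createCSV({}, filename, ["doi", "date"], 0)
#         asn.createPublicationDatesCSV(publications, filename)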
def cleanCandidatesCSV(filename):
    # Deduplicate candidates: a row is kept when its id has not been seen yet,
    # or when the same id reappears with a different session or level.
    candidates = {}
    candidatesByName = {}  # keyed by candidate id (row[4]), despite the name
    columns = ['name', 'session', 'level', 'subject', 'id', 'journal_dois',
               'dois', 'real_articles', 'real_citations', 'real_hindex',
               'threshold_articles', 'threshold_citations', 'threshold_hindex']
    with open(filename, encoding='utf-8') as document:
        reader = csv.reader(document, delimiter=",")
        next(reader)  # skip the header row
        candidatesIndex = 0
        for row in reader:
            candidate = dict(zip(columns, row))
            if row[4] in candidatesByName:
                if (row[1] != candidatesByName[row[4]]['session']
                        or row[2] != candidatesByName[row[4]]['level']):
                    candidates[candidatesIndex] = candidate
                    candidatesIndex = candidatesIndex + 1
            else:
                candidatesByName[row[4]] = candidate
                candidates[candidatesIndex] = candidate
                candidatesIndex = candidatesIndex + 1
    if not os.path.exists('./data/tmp'):
        os.makedirs('./data/tmp')
    shutil.copyfile(filename, './data/tmp/BACKUP_CANDIDATES_OUT.csv')
    os.remove(filename)
    try:
        asn.createCSV(candidates, filename, columns, 0)
    except Exception:
        log = open('./data/tmp/log.txt', 'a')
        log.write('Error while refactoring CANDIDATES_OUT\n')
        log.close()
        # Restore the original file from the backup (the original code created
        # an empty file here instead, losing the data).
        shutil.copyfile('./data/tmp/BACKUP_CANDIDATES_OUT.csv', filename)
    finally:
        # Remove the backup exactly once (previously it was also removed in the
        # except branch, which made this line raise FileNotFoundError).
        os.remove('./data/tmp/BACKUP_CANDIDATES_OUT.csv')
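
# The dedup above keeps the first row per id, plus any later row whose
# (session, level) differ from that first occurrence. A sketch of the same
# idea keyed directly on the (id, session, level) triple -- illustrative, and
# slightly stricter than the original, which only compares against the first
# occurrence of each id:
def dedupCandidates(rows):
    seen = set()
    kept = []
    for row in rows:
        key = (row[4], row[1], row[2])  # id, session, level
        if key not in seen:
            seen.add(key)
            kept.append(row)
    return kept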
        if not fileFound:
            print(CANDIDATES_OUT, ' OR ', CITATIONS_OUT, ' OR ', PUBLICATION_DATES, ' NOT FOUND')
        else:
            print('CALCULATING INDEXES')  # the original omitted the print() call
            if asn.checkFileIsPresent(CROSS_DATA):
                os.remove(CROSS_DATA)
            candidates = asn.createDict(CANDIDATES_OUT)
            citations = asn.createSimpleDict(CITATIONS_OUT)
            publicationDates = asn.createSimpleDict(PUBLICATION_DATES)
            crossData = asn.crossData(candidates, citations, publicationDates)
            candidates = {}
            citations = {}
            asn.createCSV(crossData, CROSS_DATA,
                          ['name', 'session', 'level', 'subject', 'id',
                           'articles', 'citations', 'hindex',
                           'real_articles', 'real_citations', 'real_hindex',
                           'threshold_articles', 'threshold_citations',
                           'threshold_hindex'], 0)
    elif choice == 4:
        results = asn.analizeResults(CROSS_DATA)
        subjectsFinal = []
        doAll = True
        candidatesZero = 0
        resultsAll = {
            1: {
                'overall': 0,
                'articles': 0,
                'citations': 0,
                'hindex': 0,
            },
            2: {
                'overall': 0,
                'articles': 0,
                'citations': 0,
                'hindex': 0,
            },
        }
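
# asn.crossData is defined elsewhere; conceptually it recomputes each
# candidate's bibliometric indicators ('articles', 'citations', 'hindex')
# from the crossed citation and publication-date data. A minimal sketch of
# the h-index part of such a computation -- illustrative only; the real
# crossData also handles the article and citation counts and the ASN time
# windows:
def hIndex(citationCounts):
    """h = the largest n such that at least n papers have >= n citations each."""
    counts = sorted(citationCounts, reverse=True)
    h = 0
    for i, c in enumerate(counts, start=1):
        if c >= i:
            h = i
        else:
            break
    return h

# Example: hIndex([10, 8, 5, 4, 3]) == 4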
def formatData(filename, calculatedRows, candidatesCSV, publicationDatesCSV, citationsCSV):
    candidates = {}
    with open(filename, encoding='utf-8') as document:
        reader = csv.reader(document, dialect='excel-tab')
        next(reader)  # skip the header row
        # Skip the rows already processed, resuming from the last row
        # handled in the previous run.
        for _ in range(calculatedRows):
            next(reader)
        candidateIndex = 0
        doneRows = calculatedRows + 1
        for row in reader:
            # Only process rows that carry both real data and threshold data.
            if row[8] != '' and row[13] != '':
                session = row[0]
                level = row[1]
                subject = row[2]
                candidateId = row[4]
                dois = row[6]
                realData = {
                    "articles": row[8],
                    "citations": row[9],
                    "hindex": row[10]
                }
                threshold = {
                    "articles": row[13],
                    "citations": row[14],
                    "hindex": row[15]
                }
                journalDois = []
                doisArray = ast.literal_eval(dois)
                doisArray = set(doisArray)  # remove duplicates
                publicationDates = {}
                dois = []  # rebuilt as a lowercase list
                for doi in doisArray:
                    dois.append(doi.lower())
                authors = {}
                authorsIndex = 0
                # Query each DOI in parallel; every result is a
                # (journal, publicationDate, doi, author) tuple.
                with Pool(processes=8) as pool:
                    results = pool.map(checkDoiJournalArticle, doisArray)
                for elem in results:
                    journal = elem[0]
                    publicationDate = elem[1]
                    doi = elem[2].lower()
                    author = elem[3]
                    if journal != "":
                        journalDois.append(journal.lower())
                    if publicationDate != 0 and publicationDate != 9999:
                        publicationDates[doi] = publicationDate
                    authors[authorsIndex] = author
                    authorsIndex = authorsIndex + 1
                candidateName = findCandidateName(authors)
                # Complete missing journal / date information from DBLP.
                dblp = checkAuthorDBLP(candidateName)
                for doi in doisArray:
                    doi = doi.lower()
                    if doi in dblp and dblp[doi]['journal']:
                        if doi not in journalDois:
                            journalDois.append(doi)
                        if doi not in publicationDates:
                            publicationDates[doi] = dblp[doi]['date']
                if len(journalDois) > 0 or len(doisArray) > 0:
                    candidates[candidateIndex] = {
                        'name': candidateName,
                        'session': session,
                        'level': level,
                        'subject': subject,
                        'id': candidateId,
                        'journal_dois': journalDois,
                        'dois': dois,
                        'real_articles': realData['articles'],
                        'real_citations': realData['citations'],
                        'real_hindex': realData['hindex'],
                        'threshold_articles': threshold['articles'],
                        'threshold_citations': threshold['citations'],
                        'threshold_hindex': threshold['hindex']}
                    candidateIndex = candidateIndex + 1
                # Write the candidates processed so far to the candidates CSV.
                asn.createCSV(candidates, candidatesCSV,
                              ['name', 'session', 'level', 'subject', 'id',
                               'journal_dois', 'dois', 'real_articles',
                               'real_citations', 'real_hindex',
                               'threshold_articles', 'threshold_citations',
                               'threshold_hindex'], calculatedRows)
                if len(publicationDates) > 0:
                    asn.createPublicationDatesCSV(
                        publicationDates, publicationDatesCSV)
                candidates = {}
            log = open('./data/tmp/log.txt', 'a')
            log.write('END ROW ' + str(doneRows) + '\n')
            log.close()
            doneRows = doneRows + 1
            calculatedRows = calculatedRows + 1
    cleanCandidatesCSV(candidatesCSV)
    cleanPublicationCSV(publicationDatesCSV)
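
# checkDoiJournalArticle is defined elsewhere in the project; formatData only
# relies on the shape of its return value. A hedged stub making that assumed
# contract explicit (the actual metadata lookup is omitted, and the stub name
# is not part of the project's code):
def checkDoiJournalArticleStub(doi):
    """Return a (journalDoi, publicationDate, doi, author) tuple.

    Assumed contract, inferred from how formatData consumes the results:
    journalDoi is '' when the DOI is not a journal article; publicationDate
    is 0 or 9999 when no usable year was found; author is whatever name
    string the metadata source provides.
    """
    return ("", 0, doi, "")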
CROSS_DATA = configurations.CROSS_DATA
REAL_DATA = configurations.REAL_DATA
PUBLICATION_DATES = configurations.PUBLICATION_DATES
SUBJECTS = configurations.SUBJECTS

startTime = datetime.now()

if __name__ == '__main__':
    freeze_support()
    choice = asn.mainMenu()
    if choice == 1:
        if asn.checkFileIsPresent(CANDIDATES_IN):
            # Patch to add a header to the publication_dates file.
            if not asn.checkFileIsPresent(PUBLICATION_DATES):
                asn.createCSV({}, PUBLICATION_DATES, ["doi", "date"], 0)
            calculatedRows = 0
            os.makedirs('./data/tmp', exist_ok=True)  # ensure the log directory exists
            log = open('./data/tmp/log.txt', 'a')
            log.write('GENERATING CANDIDATES\n')
            if asn.checkFileIsPresent(CANDIDATES_OUT):
                calculatedRows = asn.checkProcess(CANDIDATES_OUT)
                log.write('RESUMING FROM ROW ' + str(calculatedRows) + '\n')
            log.close()
            asn.formatData(CANDIDATES_IN, calculatedRows,
                           CANDIDATES_OUT, PUBLICATION_DATES, CITATIONS_OUT)
        else:
            print(CANDIDATES_IN, ' NOT FOUND')
        log = open('./data/tmp/log.txt', 'a')
        log.write('CANDIDATES GENERATED\n')
        log.close()
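
# asn.checkProcess is defined in the project's asn module; from the way it is
# used above, it appears to report how many rows of CANDIDATES_OUT have
# already been produced so an interrupted run can resume. A minimal sketch of
# that assumed behaviour -- the name, signature, and row-counting logic are
# inferences, not the project's actual implementation:
def checkProcessSketch(filename):
    with open(filename, encoding='utf-8') as document:
        reader = csv.reader(document, delimiter=",")
        next(reader)  # skip the header row
        return sum(1 for _ in reader)  # number of data rows already written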