import csv
import os
import shutil
# 'asn' is the project's helper module; it is assumed to be imported in the
# part of the original file not shown here.


def cleanPublicationCSV(filename):
    publications = {}
    with open(filename, encoding='utf-8') as document:
        reader = csv.reader(document, delimiter=",")
        next(reader)
        for row in reader:
            if row[0] not in publications:
                publications[row[0]] = row[1]
            else:
                # keep the earliest date when a DOI appears more than once
                publications[row[0]] = min(row[1], publications[row[0]])
    os.makedirs('./data/tmp', exist_ok=True)
    # shutil.copyfile creates the destination file itself
    shutil.copyfile(filename, './data/tmp/BACKUP_PUBLICATION_DATES.csv')
    os.remove(filename)
    try:
        # patch to add a header to the publication_dates file
        asn.createCSV({}, filename, ["doi", "date"], 0)
        asn.createPublicationDatesCSV(publications, filename)
    except Exception:
        with open('./data/tmp/log.txt', 'a') as log:
            log.write('Error while refactoring PUBLICATION_DATES.csv\n')
        # restore the original file from the backup on failure
        shutil.copyfile('./data/tmp/BACKUP_PUBLICATION_DATES.csv', filename)
    finally:
        # delete the backup exactly once; a second os.remove in the except
        # clause would raise FileNotFoundError here
        os.remove('./data/tmp/BACKUP_PUBLICATION_DATES.csv')
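
The dedup rule above keeps the smallest date string per DOI. A standalone
sketch of just that rule (the sample rows are illustrative, not from the
original data): because the dates are compared as strings, min() only picks
the earliest date while all values share a year-first format.

rows = [('10.1000/a', '2016-03-01'), ('10.1000/a', '2014-07-22')]
publications = {}
for doi, date in rows:
    # first occurrence wins, later duplicates keep the minimum
    publications[doi] = min(date, publications.get(doi, date))
assert publications['10.1000/a'] == '2014-07-22'
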
def cleanCandidatesCSV(filename):
    candidates = {}
    candidatesByName = {}
    with open(filename, encoding='utf-8') as document:
        reader = csv.reader(document, delimiter=",")
        next(reader)
        candidatesIndex = 0
        for row in reader:
            # every row maps onto the same record structure
            record = {
                'name': row[0], 'session': row[1], 'level': row[2],
                'subject': row[3], 'id': row[4], 'journal_dois': row[5],
                'dois': row[6], 'real_articles': row[7],
                'real_citations': row[8], 'real_hindex': row[9],
                'threshold_articles': row[10],
                'threshold_citations': row[11],
                'threshold_hindex': row[12]}
            if row[4] in candidatesByName:
                # keep a repeated id only when it comes from a different
                # session or level
                if (row[1] != candidatesByName[row[4]]['session']
                        or row[2] != candidatesByName[row[4]]['level']):
                    candidates[candidatesIndex] = record
                    candidatesIndex = candidatesIndex + 1
            else:
                candidatesByName[row[4]] = record
                candidates[candidatesIndex] = record
                candidatesIndex = candidatesIndex + 1
    os.makedirs('./data/tmp', exist_ok=True)
    # shutil.copyfile creates the destination file itself
    shutil.copyfile(filename, './data/tmp/BACKUP_CANDIDATES_OUT.csv')
    os.remove(filename)
    try:
        asn.createCSV(candidates, filename,
                      ['name', 'session', 'level', 'subject', 'id',
                       'journal_dois', 'dois', 'real_articles',
                       'real_citations', 'real_hindex', 'threshold_articles',
                       'threshold_citations', 'threshold_hindex'], 0)
    except Exception:
        with open('./data/tmp/log.txt', 'a') as log:
            log.write('Error while refactoring CANDIDATES_OUT\n')
        # restore the original file from the backup on failure
        shutil.copyfile('./data/tmp/BACKUP_CANDIDATES_OUT.csv', filename)
    finally:
        # delete the backup exactly once; a second os.remove in the except
        # clause would raise FileNotFoundError here
        os.remove('./data/tmp/BACKUP_CANDIDATES_OUT.csv')
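
Both clean* functions repeat the same backup-copy / rewrite / restore-on-error
dance. A hypothetical refactoring of that pattern as a context manager
(with_backup is not part of the original code, only a sketch of the design):

import contextlib

@contextlib.contextmanager
def with_backup(filename, backup):
    # set the original aside, let the caller rewrite it, restore on failure
    shutil.copyfile(filename, backup)
    os.remove(filename)
    try:
        yield
    except Exception:
        shutil.copyfile(backup, filename)
        raise
    finally:
        # the backup is deleted exactly once, whatever happened
        os.remove(backup)
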
Example #3
    if not fileFound:
        print(CANDIDATES_OUT, ' OR ', CITATIONS_OUT, ' OR ',
              PUBLICATION_DATES, ' NOT FOUND')
    else:
        print('CALCULATING INDEXES')
        if asn.checkFileIsPresent(CROSS_DATA):
            os.remove(CROSS_DATA)
        candidates = asn.createDict(CANDIDATES_OUT)
        citations = asn.createSimpleDict(CITATIONS_OUT)
        publicationDates = asn.createSimpleDict(PUBLICATION_DATES)
        crossData = asn.crossData(candidates, citations, publicationDates)
        # release the intermediate dictionaries before writing the CSV
        candidates = {}
        citations = {}
        asn.createCSV(crossData, CROSS_DATA, [
            'name', 'session', 'level', 'subject', 'id', 'articles',
            'citations', 'hindex', 'real_articles', 'real_citations',
            'real_hindex', 'threshold_articles', 'threshold_citations',
            'threshold_hindex'
        ], 0)
elif choice == 4:
    results = asn.analizeResults(CROSS_DATA)
    subjectsFinal = []
    doAll = True
    candidatesZero = 0
    resultsAll = {
        1: {
            'overall': 0,
            'articles': 0,
            'citations': 0,
            'hindex': 0,
        },
        2: {
Example #4

# Assumed imports for this snippet: ast, csv and multiprocessing.Pool from
# the standard library; 'asn' and the helpers checkDoiJournalArticle,
# findCandidateName and checkAuthorDBLP come from the surrounding project.
import ast
import csv
from multiprocessing import Pool


def formatData(filename, calculatedRows, candidatesCSV, publicationDatesCSV, citationsCSV):
    candidates = {}
    with open(filename, encoding='utf-8') as document:
        reader = csv.reader(document, dialect='excel-tab')
        next(reader)
        # skip rows until reaching the last row processed in the previous run
        for _ in range(calculatedRows):
            next(reader)
        candidateIndex = 0
        doneRows = calculatedRows + 1
        for row in reader:
            if row[8] != '' and row[13] != '':
                session = row[0]
                level = row[1]
                subject = row[2]
                candidateId = row[4]
                dois = row[6]
                realData = {
                    "articles": row[8],
                    "citations": row[9],
                    "hindex": row[10]
                }
                threshold = {
                    "articles": row[13],
                    "citations": row[14],
                    "hindex": row[15]
                }
                journalDois = []
                doisArray = set(ast.literal_eval(dois))  # drop duplicate DOIs
                publicationDates = {}
                dois = [doi.lower() for doi in doisArray]
                authors = {}
                authorsIndex = 0
                # resolve every DOI in parallel; each worker returns a
                # (journal, publicationDate, doi, author) tuple
                with Pool(processes=8) as pool:
                    results = pool.map(checkDoiJournalArticle, doisArray)
                for elem in results:
                    journal = elem[0]
                    publicationDate = elem[1]
                    doi = elem[2].lower()
                    author = elem[3]
                    if journal != "":
                        journalDois.append(journal.lower())
                    if publicationDate != 0 and publicationDate != 9999:
                        publicationDates[doi] = publicationDate
                    authors[authorsIndex] = author
                    authorsIndex = authorsIndex + 1
                candidateName = findCandidateName(authors)
                dblp = checkAuthorDBLP(candidateName)
                # DBLP acts as a fallback source for journal flags and dates
                for doi in doisArray:
                    doi = doi.lower()
                    if doi in dblp and dblp[doi]['journal']:
                        if doi not in journalDois:
                            journalDois.append(doi)
                        if doi not in publicationDates:
                            publicationDates[doi] = dblp[doi]['date']
                if journalDois or doisArray:
                    candidates[candidateIndex] = {
                        'name': candidateName, 'session': session,
                        'level': level, 'subject': subject,
                        'id': candidateId, 'journal_dois': journalDois,
                        'dois': dois, 'real_articles': realData['articles'],
                        'real_citations': realData['citations'],
                        'real_hindex': realData['hindex'],
                        'threshold_articles': threshold['articles'],
                        'threshold_citations': threshold['citations'],
                        'threshold_hindex': threshold['hindex']}
                    candidateIndex = candidateIndex + 1
                    # write the candidates processed so far to the CSV
                    asn.createCSV(candidates, candidatesCSV,
                                  ['name', 'session', 'level', 'subject',
                                   'id', 'journal_dois', 'dois',
                                   'real_articles', 'real_citations',
                                   'real_hindex', 'threshold_articles',
                                   'threshold_citations',
                                   'threshold_hindex'], calculatedRows)
                if publicationDates:
                    asn.createPublicationDatesCSV(
                        publicationDates, publicationDatesCSV)
                # reset so the next row is written on its own
                candidates = {}
            with open('./data/tmp/log.txt', 'a') as log:
                log.write('END ROW ' + str(doneRows) + '\n')
            doneRows = doneRows + 1
            calculatedRows = calculatedRows + 1
    cleanCandidatesCSV(candidatesCSV)
    cleanPublicationCSV(publicationDatesCSV)
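
formatData relies on checkDoiJournalArticle returning a 4-tuple per DOI. The
real worker queries external services; below is a stub showing only the
expected shape, reconstructed from how the results are unpacked above (an
assumption, not the original implementation):

def checkDoiJournalArticle(doi):
    # expected contract: (journal DOI or '' when not a journal article,
    #                     publication year, with 0/9999 as unknown sentinels,
    #                     the DOI itself, one author name)
    return ('', 0, doi, '')
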
Example #5
# Assumed imports for this snippet; the top of the original file is
# truncated, and CANDIDATES_IN, CANDIDATES_OUT and CITATIONS_OUT are defined
# in the missing part.
import configurations
from datetime import datetime
from multiprocessing import freeze_support

CROSS_DATA = configurations.CROSS_DATA
REAL_DATA = configurations.REAL_DATA
PUBLICATION_DATES = configurations.PUBLICATION_DATES
SUBJECTS = configurations.SUBJECTS

startTime = datetime.now()

if __name__ == '__main__':
    freeze_support()
    choice = asn.mainMenu()
    if choice == 1:
        if asn.checkFileIsPresent(CANDIDATES_IN):

            # patch to add a header to the publication_dates file
            if not asn.checkFileIsPresent(PUBLICATION_DATES):
                asn.createCSV({}, PUBLICATION_DATES, ["doi", "date"], 0)

            calculatedRows = 0
            with open('./data/tmp/log.txt', 'a') as log:
                log.write('GENERATING CANDIDATES\n')
                if asn.checkFileIsPresent(CANDIDATES_OUT):
                    # resume from the last row written in a previous run
                    calculatedRows = asn.checkProcess(CANDIDATES_OUT)
                    log.write('RESUMING FROM ROW ' + str(calculatedRows) + '\n')
            asn.formatData(CANDIDATES_IN, calculatedRows, CANDIDATES_OUT,
                           PUBLICATION_DATES, CITATIONS_OUT)
        else:
            print(CANDIDATES_IN, ' NOT FOUND')
        with open('./data/tmp/log.txt', 'a') as log:
            log.write('CANDIDATES GENERATED\n')
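
startTime is captured when the script starts but never used in this fragment;
presumably the full script reports the elapsed time at the end, along these
lines (a sketch, not the original code):

    elapsed = datetime.now() - startTime
    print('DONE IN', elapsed)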