Example #1
0
def handle_article(_: Any, article: ArticleType) -> bool:  # type: ignore
    """Process one parsed DBLP record and credit its authors.

    The record is dropped unless it has an author, at least one author is
    known through ``facultydict`` / ``aliasdict`` / ``reversealiasdict``
    (skipped when ``args.all`` is set), its venue contains
    ``args.conference`` and is a key of ``confdict``, and ``countPaper``
    accepts it.  For an accepted record ``totalPapers`` is incremented and,
    for every author with a non-empty affiliation (every author when
    ``args.all`` is set), a log entry is appended to ``authlogs`` and the
    raw and per-author-adjusted scores are updated.

    Args:
        _: Unused first callback argument.
        article: Mapping of field name to value for one record.

    Returns:
        Always ``True``, for processed and skipped records alike.

    Raises:
        Any exception raised while reading the record is re-raised.
        Exceptions other than ``TypeError`` are additionally printed and
        counted in ``failures``.
    """
    global counter
    global successes
    global failures
    global totalPapers
    counter += 1
    try:
        if counter % 10000 == 0:
            print(f"{counter} papers processed.")
        if "author" not in article:
            return True
        # A record with a single author does not hold a list; normalize.
        rawAuthors = article["author"]
        authorList: List[str] = []
        if isinstance(rawAuthors, list):
            authorList = rawAuthors
        elif isinstance(rawAuthors, str):
            authorList = [str(rawAuthors)]
        elif isinstance(rawAuthors, dict):  # OrderedDict is a dict subclass
            authorList = [rawAuthors["#text"]]  # type: ignore
        else:
            print("***Unknown record type, skipping.***")
            return True
        authorsOnPaper = len(authorList)
        if not args.all:
            # Keep the record only if some author is known, directly or
            # through either alias table.
            foundOneInDict = False
            for authorName in authorList:
                if isinstance(authorName, dict):
                    aName = authorName["#text"]  # type: ignore
                else:
                    aName = authorName
                aName = aName.strip()
                if aName in facultydict:
                    foundOneInDict = True
                    break
                # The two tables are consulted independently, so a name
                # missing from aliasdict is still looked up in
                # reversealiasdict.
                alias = aliasdict.get(aName)
                if alias is not None and alias in facultydict:
                    foundOneInDict = True
                    break
                reverseAlias = reversealiasdict.get(aName)
                if reverseAlias is not None and reverseAlias in facultydict:
                    foundOneInDict = True
                    break
            if not foundOneInDict:
                return True
        if "booktitle" in article:
            confname = Conference(article["booktitle"])
        elif "journal" in article:
            confname = Conference(article["journal"])
        else:
            return True

        if args.conference not in confname:
            return True

        if confname not in confdict:
            return True

        volume = article.get("volume", "0")
        number = article.get("number", "0")
        url = article.get("url", "")
        year = int(article.get("year", "-1"))
        pages = ""

        areaname = confdict[confname]
        # Special handling for PACMPL: the venue is carried in "number".
        if areaname == Area("pacmpl"):
            confname = Conference(article["number"])
            if confname in confdict:
                areaname = confdict[confname]
            else:
                return True
        elif confname == Conference("ACM Trans. Graph."):
            # Journal issues listed in the volume tables are re-labelled
            # as the corresponding conference.
            if year in TOG_SIGGRAPH_Volume:
                (vol, num) = TOG_SIGGRAPH_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    confname = Conference("SIGGRAPH")
                    areaname = confdict[confname]
            if year in TOG_SIGGRAPH_Asia_Volume:
                (vol, num) = TOG_SIGGRAPH_Asia_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    confname = Conference("SIGGRAPH Asia")
                    areaname = confdict[confname]
        elif confname == "IEEE Trans. Vis. Comput. Graph.":
            if year in TVCG_Vis_Volume:
                (vol, num) = TVCG_Vis_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    areaname = Area("vis")
            if year in TVCG_VR_Volume:
                (vol, num) = TVCG_VR_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    confname = Conference("VR")
                    areaname = Area("vr")

        # Bound unconditionally so the countPaper call below never sees an
        # unbound name when the record has no "title" field.
        title = Title("")
        if "title" in article:
            rawTitle = article["title"]
            if isinstance(rawTitle, dict):
                title = Title(rawTitle["#text"])  # type: ignore
            else:
                title = Title(rawTitle)

        if "pages" in article:
            pages = article["pages"]
            pageCount = pagecount(pages)
            startPage = startpage(pages)
        else:
            pageCount = -1
            startPage = -1
        successes += 1
    except TypeError:
        raise
    except BaseException:
        print(sys.exc_info()[0])
        failures += 1
        raise

    if countPaper(confname, year, volume, number, pages, startPage, pageCount,
                  url, title):
        totalPapers += 1
        for authorName in authorList:
            aName = ""
            if isinstance(authorName, dict):
                aName = authorName["#text"]  # type: ignore
            elif isinstance(authorName, str):
                aName = authorName
            realName = aliasdict.get(aName, aName)
            affiliation = ""
            if realName in facultydict:
                affiliation = facultydict[realName]
            elif realName in aliasdict:
                affiliation = facultydict[aliasdict[realName]]
            elif realName in reversealiasdict:
                affiliation = facultydict[reversealiasdict[realName]]
            # Every author seen on a counted paper is stored in
            # facultydict, with an empty affiliation when unknown.
            facultydict[realName] = affiliation

            # realName was just stored in facultydict, so only the
            # affiliation (or args.all) decides whether to credit it.
            if affiliation or args.all:
                log: LogType = {
                    "name": realName.encode("utf-8"),
                    "year": year,
                    "title": title.encode("utf-8"),
                    "conf": confname,
                    "area": areaname,
                    "institution": affiliation,
                    "numauthors": authorsOnPaper,
                    "volume": volume,
                    "number": number,
                    "startPage": startPage,
                    "pageCount": pageCount,
                }
                tmplist: List[LogType] = authlogs.get(realName, [])
                tmplist.append(log)
                authlogs[realName] = tmplist
                interestingauthors[realName] += 1
                authorscores[(realName, areaname, year)] += 1.0
                authorscoresAdjusted[(realName, areaname,
                                      year)] += 1.0 / authorsOnPaper
    return True
Example #2
0
def parseDBLP(facultydict):
    """Scan ``dblp.xml`` and tally papers by authors found in *facultydict*.

    Every ``inproceedings`` and ``article`` record whose venue is a key of
    ``confdict`` is examined.  A record is credited when its year lies in
    ``[startyear, endyear]``, at least one author is in *facultydict*, and
    it is not rejected by the page-count and venue-specific rules below.

    Args:
        facultydict: Container of author names to credit; only membership
            tests are performed on it.

    Returns:
        A tuple ``(interestingauthors, authorscores, authorscoresAdjusted,
        authlogs)``.  ``interestingauthors`` maps author -> number of
        credited papers; ``authorscores`` maps ``(author, area, year)`` ->
        paper count; ``authorscoresAdjusted`` maps the same key -> sum of
        ``1 / number of authors``; ``authlogs`` maps author -> list of
        ``"author ; venue year"`` text strings.
    """
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}

    # Binary mode lets the XML parser apply the encoding declared in the file.
    with open('dblp.xml', mode='rb') as f:

        depth = 0        # number of currently open elements
        finished = None  # element to release once the next event arrives

        for (event, node) in ElementTree.iterparse(f, events=['start', 'end']):
            if finished is not None:
                finished.clear()
                finished = None

            # An element is only guaranteed to be fully populated at its
            # 'end' event; 'start' events are used just to track nesting.
            if event == 'start':
                depth += 1
                continue
            depth -= 1

            isRecord = node.tag == 'inproceedings' or node.tag == 'article'
            if isRecord or depth == 1:
                # Release handled records, and any other direct child of
                # the root, so the tree does not grow with the file.
                finished = node
            if not isRecord:
                continue

            foundArticle = False
            inRange = False
            authorList = []
            confname = ""
            year = -1
            pageCount = -1
            startPage = -1
            foundOneInDict = False
            number = 0
            volume = 0

            # Collect the fields of this record in a single pass.
            for child in node:
                if child.tag == 'booktitle' or child.tag == 'journal':
                    confname = child.text
                    if confname in confdict:
                        foundArticle = True
                elif child.tag == 'volume':
                    volume = child.text
                elif child.tag == 'number':
                    number = child.text
                elif child.tag == 'year':
                    if child.text is not None:
                        year = int(child.text)
                elif child.tag == 'pages':
                    pageCount = pagecount(child.text)
                    startPage = startpage(child.text)
                elif child.tag == 'author':
                    # Authors without text are neither counted nor credited.
                    if child.text is not None:
                        authorName = child.text.strip()
                        authorList.append(authorName)
                        if authorName in facultydict:
                            foundOneInDict = True
            authorsOnPaper = len(authorList)

            if not foundArticle:
                # Not one of our conferences.
                continue

            areaname = confdict[confname]

            # Special handling for ISMB: only the listed volume/number of
            # Bioinformatics counts for a given year.
            if confname == 'Bioinformatics':
                if year not in ISMB_Bioinformatics:
                    continue
                (vol, num) = ISMB_Bioinformatics[year]
                if (volume != str(vol)) or (number != str(num)):
                    continue

            # Special handling for ICSE.
            if confname in ('ICSE', 'ICSE (1)', 'ICSE (2)'):
                if year in ICSE_ShortPaperStart:
                    pageno = ICSE_ShortPaperStart[year]
                    if startPage >= pageno:
                        # Omit papers that start at or beyond this page,
                        # since they are "short papers" (regardless of
                        # their length).
                        continue

            # Check that dates are in the specified range.
            if (year >= startyear) and (year <= endyear):
                inRange = True

            if year == -1:
                # No year.
                print("NO YEAR WAT", confname)
                continue

            # A page count of -1 means no page information was available;
            # such records are not rejected for length.
            tooFewPages = False
            if (pageCount != -1) and (pageCount < pageCountThreshold):
                tooFewPages = True
                exceptionConference = confname in (
                    'SC', 'SIGSOFT FSE', 'PLDI', 'ACM Trans. Graph.')
                # These venues are still counted when the page count is 0.
                if (pageCount == 0) and exceptionConference:
                    tooFewPages = False

            if (confname == 'ASE') and (pageCount <= 6):
                tooFewPages = True

            if (not inRange) or (not foundOneInDict) or tooFewPages:
                continue

            # If we got here, we have a winner.
            for authorName in authorList:
                if authorName not in facultydict:
                    continue
                logstring = authorName + " ; " + confname + " " + str(year)
                authlogs.setdefault(authorName, []).append(logstring)
                interestingauthors[authorName] = interestingauthors.get(
                    authorName, 0) + 1
                key = (authorName, areaname, year)
                authorscores[key] = authorscores.get(key, 0) + 1.0
                authorscoresAdjusted[key] = authorscoresAdjusted.get(
                    key, 0) + 1.0 / authorsOnPaper

    return (interestingauthors, authorscores, authorscoresAdjusted, authlogs)
Example #3
0
def handle_article(_ : Any, article : ArticleType) -> bool: # type: ignore
    """Process one parsed DBLP record and credit its authors.

    The record is dropped unless it has an author, at least one author is
    known through ``facultydict`` / ``aliasdict`` / ``reversealiasdict``
    (skipped when ``args.all`` is set), its venue contains
    ``args.conference`` and is a key of ``confdict``, and ``countPaper``
    accepts it.  For an accepted record ``totalPapers`` is incremented and,
    for every author with a non-empty affiliation (every author when
    ``args.all`` is set), a log entry is appended to ``authlogs`` and the
    raw and per-author-adjusted scores are updated.

    Args:
        _: Unused first callback argument.
        article: Mapping of field name to value for one record.

    Returns:
        Always ``True``, for processed and skipped records alike.

    Raises:
        Any exception raised while reading the record is re-raised.
        Exceptions other than ``TypeError`` are additionally printed and
        counted in ``failures``.
    """
    global counter
    global successes
    global failures
    global totalPapers
    counter += 1
    try:
        if counter % 10000 == 0:
            print(str(counter) + " papers processed.")
        if 'author' not in article:
            return True
        # A record with a single author does not hold a list; normalize.
        rawAuthors = article['author']
        authorList : List[str] = []
        if isinstance(rawAuthors, list):
            authorList = rawAuthors
        elif isinstance(rawAuthors, str):
            authorList = [str(rawAuthors)]
        elif isinstance(rawAuthors, dict):
            # Matches plain dicts as well as collections.OrderedDict.
            authorList = [rawAuthors["#text"]] # type: ignore
        else:
            print("***Unknown record type, skipping.***")
            return True
        authorsOnPaper = len(authorList)
        if not args.all:
            # Keep the record only if some author is known, directly or
            # through either alias table.
            foundOneInDict = False
            for authorName in authorList:
                if isinstance(authorName, dict):
                    aName = authorName["#text"] # type: ignore
                else:
                    aName = authorName
                aName = aName.strip()
                if aName in facultydict:
                    foundOneInDict = True
                    break
                # Explicit lookups instead of a catch-all except: a name
                # missing from aliasdict is still looked up in
                # reversealiasdict, and unrelated errors are not hidden.
                alias = aliasdict.get(aName)
                if alias is not None and alias in facultydict:
                    foundOneInDict = True
                    break
                reverseAlias = reversealiasdict.get(aName)
                if reverseAlias is not None and reverseAlias in facultydict:
                    foundOneInDict = True
                    break

            if not foundOneInDict:
                return True
        if 'booktitle' in article:
            confname = Conference(article['booktitle'])
        elif 'journal' in article:
            confname = Conference(article['journal'])
        else:
            return True

        if args.conference not in confname:
            return True

        if confname not in confdict:
            return True

        volume = article.get('volume', "0")
        number = article.get('number', "0")
        url = article.get('url', "")
        year = int(article.get('year', "-1"))
        pages = ""

        areaname = confdict[confname]
        # Special handling for PACMPL: the venue is carried in 'number'.
        if areaname == Area('pacmpl'):
            confname = Conference(article['number'])
            if confname in confdict:
                areaname = confdict[confname]
            else:
                return True
        elif confname == Conference('ACM Trans. Graph.'):
            # Journal issues listed in the volume tables are re-labelled
            # as the corresponding conference.
            if year in TOG_SIGGRAPH_Volume:
                (vol, num) = TOG_SIGGRAPH_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    confname = Conference('SIGGRAPH')
                    areaname = confdict[confname]
            if year in TOG_SIGGRAPH_Asia_Volume:
                (vol, num) = TOG_SIGGRAPH_Asia_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    confname = Conference('SIGGRAPH Asia')
                    areaname = confdict[confname]
        elif confname == 'IEEE Trans. Vis. Comput. Graph.':
            if year in TVCG_Vis_Volume:
                (vol, num) = TVCG_Vis_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    areaname = Area('vis')
            if year in TVCG_VR_Volume:
                (vol, num) = TVCG_VR_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    confname = Conference('VR')
                    areaname = Area('vr')

        # Bound unconditionally so the countPaper call below never sees an
        # unbound name when the record has no 'title' field.
        title = Title("")
        if 'title' in article:
            rawTitle = article['title']
            if isinstance(rawTitle, dict):
                title = Title(rawTitle["#text"]) # type: ignore
            else:
                title = Title(rawTitle)

        if 'pages' in article:
            pages = article['pages']
            pageCount = pagecount(pages)
            startPage = startpage(pages)
        else:
            pageCount = -1
            startPage = -1
        successes += 1
    except TypeError:
        raise
    except BaseException:
        print(sys.exc_info()[0])
        failures += 1
        raise

    if countPaper(confname, year, volume, number, pages, startPage, pageCount, url, title):
        totalPapers += 1
        for authorName in authorList:
            aName = ""
            if isinstance(authorName, dict):
                aName = authorName["#text"] # type: ignore
            elif isinstance(authorName, str):
                aName = authorName
            realName = aliasdict.get(aName, aName)
            affiliation = ""
            if realName in facultydict:
                affiliation = facultydict[realName]
            elif realName in aliasdict:
                affiliation = facultydict[aliasdict[realName]]
            elif realName in reversealiasdict:
                affiliation = facultydict[reversealiasdict[realName]]
            # Every author seen on a counted paper is stored in
            # facultydict, with an empty affiliation when unknown.
            facultydict[realName] = affiliation

            # realName was just stored in facultydict, so only the
            # affiliation (or args.all) decides whether to credit it.
            if affiliation or args.all:
                log : LogType = { 'name' : realName.encode('utf-8'),
                                  'year' : year,
                                  'title' : title.encode('utf-8'),
                                  'conf' : confname,
                                  'area' : areaname,
                                  'institution' : affiliation,
                                  'numauthors' : authorsOnPaper,
                                  'volume' : volume,
                                  'number' : number,
                                  'startPage' : startPage,
                                  'pageCount' : pageCount }
                tmplist : List[LogType] = authlogs.get(realName, [])
                tmplist.append(log)
                authlogs[realName] = tmplist
                interestingauthors[realName] += 1
                authorscores[(realName, areaname, year)] += 1.0
                authorscoresAdjusted[(realName, areaname, year)] += 1.0 / authorsOnPaper
    return True
def parseDBLP(facultydict):
    """Scan ``dblp.xml.gz`` and tally papers by authors in *facultydict*.

    Every ``inproceedings`` and ``article`` record is examined.  A record
    is credited when its venue is a key of ``confdict``, at least one
    author is in *facultydict*, and ``countPaper`` accepts it.

    Args:
        facultydict: Mapping of author name -> institution.

    Returns:
        A tuple ``(interestingauthors, authorscores, authorscoresAdjusted,
        authlogs)``.  ``interestingauthors`` maps author -> number of
        credited papers; ``authorscores`` maps ``(author, area, year)`` ->
        paper count; ``authorscoresAdjusted`` maps the same key -> sum of
        ``1 / number of authors``; ``authlogs`` maps author -> list of log
        dicts with keys ``name``, ``conf``, ``area``, ``year``, ``title``
        and ``institution``.
    """
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}

    with gzip.open('dblp.xml.gz') as f:

        # NOTE(review): this DTD object is not used below; the call is kept
        # in case loading 'dblp.dtd' up front is relied upon - confirm.
        dtd = ElementTree.DTD(file='dblp.dtd')

        depth = 0        # number of currently open elements
        finished = None  # element to release once the next event arrives

        for (event, node) in ElementTree.iterparse(f, events=['start', 'end'], load_dtd=True):
            if finished is not None:
                finished.clear()
                finished = None

            # An element is only guaranteed to be fully populated at its
            # 'end' event; 'start' events are used just to track nesting.
            if event == 'start':
                depth += 1
                continue
            depth -= 1

            isRecord = node.tag == 'inproceedings' or node.tag == 'article'
            if isRecord or depth == 1:
                # Release handled records, and any other direct child of
                # the root, so the tree does not grow with the file.
                finished = node
            if not isRecord:
                continue

            foundArticle = False
            authorList = []
            confname = ""
            title = ""
            year = -1
            pageCount = -1
            startPage = -1
            foundOneInDict = False
            number = 0
            volume = 0
            # Reset per record so a record without a <url> child neither
            # inherits the previous record's URL nor leaves it unbound.
            url = ""

            for child in node:
                if child.tag == 'booktitle' or child.tag == 'journal':
                    if child.text is not None:
                        confname = child.text
                        if confname in confdict:
                            areaname = confdict[confname]
                            foundArticle = True
                elif child.tag == 'title':
                    if child.text is not None:
                        title = child.text
                elif child.tag == 'volume':
                    volume = child.text
                elif child.tag == 'number':
                    number = child.text
                elif child.tag == 'year':
                    if child.text is not None:
                        year = int(child.text)
                elif child.tag == 'pages':
                    pageCount = pagecount(child.text)
                    startPage = startpage(child.text)
                elif child.tag == 'url':
                    url = child.text
                elif child.tag == 'author':
                    # Authors without text are neither counted nor credited.
                    if child.text is not None:
                        authorName = child.text.strip()
                        authorList.append(authorName)
                        if authorName in facultydict:
                            foundOneInDict = True
            authorsOnPaper = len(authorList)

            # One of our conferences?
            if not foundArticle:
                continue

            # Any authors in our affiliations?
            if not foundOneInDict:
                continue

            # One of the papers we count?
            if not countPaper(confname, year, volume, number, startPage, pageCount, url):
                continue

            # If we get here, we have a winner.
            for authorName in authorList:
                if authorName not in facultydict:
                    continue
                logstring = { 'name' : authorName.encode('utf-8'),
                              'conf' : confname,
                              'area' : areaname,
                              'year' : year,
                              'title' : title.encode('utf-8'),
                              'institution' : facultydict[authorName] }
                authlogs.setdefault(authorName, []).append(logstring)
                interestingauthors[authorName] = interestingauthors.get(authorName, 0) + 1
                key = (authorName, areaname, year)
                authorscores[key] = authorscores.get(key, 0) + 1.0
                authorscoresAdjusted[key] = authorscoresAdjusted.get(key, 0) + 1.0 / authorsOnPaper

    return (interestingauthors, authorscores, authorscoresAdjusted, authlogs)
Example #5
0
def parseDBLP(facultydict):
    """Scan ``dblp.xml`` and tally papers by authors found in *facultydict*.

    Every ``inproceedings`` and ``article`` record is examined.  A record
    is credited when at least one author is in *facultydict*, its venue is
    a key of ``confdict``, and ``countPaper`` accepts it.

    Args:
        facultydict: Container of author names to credit; only membership
            tests are performed on it.

    Returns:
        A tuple ``(interestingauthors, authorscores, authorscoresAdjusted,
        authlogs)``.  ``interestingauthors`` maps author -> number of
        credited papers; ``authorscores`` maps ``(author, area, year)`` ->
        paper count; ``authorscoresAdjusted`` maps the same key -> sum of
        ``1 / number of authors``; ``authlogs`` maps author -> list of
        ``"author ; venue year: title"`` text strings.
    """
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}

    # Binary mode lets the XML parser apply the encoding declared in the file.
    with open('dblp.xml', mode='rb') as f:

        depth = 0        # number of currently open elements
        finished = None  # element to release once the next event arrives

        for (event, node) in ElementTree.iterparse(f, events=['start', 'end']):
            if finished is not None:
                finished.clear()
                finished = None

            # An element is only guaranteed to be fully populated at its
            # 'end' event; 'start' events are used just to track nesting.
            if event == 'start':
                depth += 1
                continue
            depth -= 1

            isRecord = node.tag == 'inproceedings' or node.tag == 'article'
            if isRecord or depth == 1:
                # Release handled records, and any other direct child of
                # the root, so the tree does not grow with the file.
                finished = node
            if not isRecord:
                continue

            foundArticle = False
            authorList = []
            confname = ""
            title = ""
            year = -1
            pageCount = -1
            startPage = -1
            foundOneInDict = False
            number = 0
            volume = 0

            # Collect the fields of this record in a single pass.
            for child in node:
                if child.tag == 'booktitle' or child.tag == 'journal':
                    confname = child.text
                    if confname in confdict:
                        areaname = confdict[confname]
                        foundArticle = True
                elif child.tag == 'title':
                    if child.text is not None:
                        title = child.text
                elif child.tag == 'volume':
                    volume = child.text
                elif child.tag == 'number':
                    number = child.text
                elif child.tag == 'year':
                    if child.text is not None:
                        year = int(child.text)
                elif child.tag == 'pages':
                    pageCount = pagecount(child.text)
                    startPage = startpage(child.text)
                elif child.tag == 'author':
                    # Authors without text are neither counted nor credited.
                    if child.text is not None:
                        authorName = child.text.strip()
                        authorList.append(authorName)
                        if authorName in facultydict:
                            foundOneInDict = True
            authorsOnPaper = len(authorList)

            # Any authors in our affiliations?
            if not foundOneInDict:
                continue

            # One of our conferences?
            if not foundArticle:
                continue

            # One of the papers we count?
            if not countPaper(confname, year, volume, number, startPage,
                              pageCount):
                continue

            # If we get here, we have a winner.
            for authorName in authorList:
                if authorName not in facultydict:
                    continue
                logstring = (authorName + " ; " + confname + " " + str(year)
                             + ": " + title)
                authlogs.setdefault(authorName, []).append(logstring)
                interestingauthors[authorName] = interestingauthors.get(
                    authorName, 0) + 1
                key = (authorName, areaname, year)
                authorscores[key] = authorscores.get(key, 0) + 1.0
                authorscoresAdjusted[key] = authorscoresAdjusted.get(
                    key, 0) + 1.0 / authorsOnPaper

    return (interestingauthors, authorscores, authorscoresAdjusted, authlogs)
def parseDBLP(facultydict):
    """Scan ``dblp.xml.gz`` and tally papers by authors in *facultydict*.

    Every ``inproceedings`` and ``article`` record is examined regardless
    of venue; venues that are not keys of ``confdict`` are filed under the
    area ``"na"``.  A record is credited when it has a year inside
    ``[startyear, endyear]``, at least one author is in *facultydict*, and
    it is not rejected by the page-count rule below.

    Args:
        facultydict: Container of author names to credit; only membership
            tests are performed on it.

    Returns:
        A tuple ``(interestingauthors, authorscores, authorscoresAdjusted,
        authlogs)``.  ``interestingauthors`` maps author -> number of
        credited papers; ``authorscores`` maps ``(author, area, year)`` ->
        paper count; ``authorscoresAdjusted`` maps the same key -> sum of
        ``1 / number of authors``; ``authlogs`` maps author -> list of
        ``"author ; venue year"`` text strings.
    """
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}

    with gzip.open('dblp.xml.gz') as f:

        depth = 0        # number of currently open elements
        finished = None  # element to release once the next event arrives

        for (event, node) in ElementTree.iterparse(f, events=['start', 'end']):
            if finished is not None:
                finished.clear()
                finished = None

            # An element is only guaranteed to be fully populated at its
            # 'end' event; 'start' events are used just to track nesting.
            if event == 'start':
                depth += 1
                continue
            depth -= 1

            isRecord = node.tag == 'inproceedings' or node.tag == 'article'
            if isRecord or depth == 1:
                # Release handled records, and any other direct child of
                # the root, so the tree does not grow with the file.
                finished = node
            if not isRecord:
                continue

            inRange = False
            authorList = []
            confname = ""
            year = -1
            pageCount = -1
            startPage = -1
            foundOneInDict = False
            number = 0
            volume = 0

            # Collect the fields of this record in a single pass.
            for child in node:
                if child.tag == 'booktitle' or child.tag == 'journal':
                    confname = child.text
                elif child.tag == 'volume':
                    volume = child.text
                elif child.tag == 'number':
                    number = child.text
                elif child.tag == 'year':
                    if child.text is not None:
                        year = int(child.text)
                elif child.tag == 'pages':
                    pageCount = pagecount(child.text)
                    startPage = startpage(child.text)
                elif child.tag == 'author':
                    # Authors without text are neither counted nor credited.
                    if child.text is not None:
                        authorName = child.text.strip()
                        authorList.append(authorName)
                        if authorName in facultydict:
                            foundOneInDict = True
            authorsOnPaper = len(authorList)

            if confname is None:
                continue

            # All venues are included; unknown ones get a placeholder area.
            if confname not in confdict:
                areaname = "na"
            else:
                areaname = confdict[confname]

            # Check that dates are in the specified range.
            if (year >= startyear) and (year <= endyear):
                inRange = True

            if year == -1:
                # No year.
                continue

            # A page count of -1 means no page information was available;
            # such records are not rejected for length.
            tooFewPages = False
            if (pageCount != -1) and (pageCount < pageCountThreshold):
                # Venues (and specific years/volumes) that are counted
                # even when the page count is below the threshold.
                exceptionConference = (
                    confname == 'SC'
                    or (confname == 'SIGSOFT FSE' and year == 2012)
                    or (confname == 'ACM Trans. Graph.'
                        and 26 <= int(volume) <= 36))
                tooFewPages = not exceptionConference

            if (not inRange) or (not foundOneInDict) or tooFewPages:
                continue

            # If we got here, we have a winner.
            for authorName in authorList:
                if authorName not in facultydict:
                    continue
                print("here we go", authorName, confname, authorsOnPaper, year)
                logstring = authorName + " ; " + confname + " " + str(year)
                authlogs.setdefault(authorName, []).append(logstring)
                interestingauthors[authorName] = interestingauthors.get(
                    authorName, 0) + 1
                key = (authorName, areaname, year)
                authorscores[key] = authorscores.get(key, 0) + 1.0
                authorscoresAdjusted[key] = authorscoresAdjusted.get(
                    key, 0) + 1.0 / authorsOnPaper

    return (interestingauthors, authorscores, authorscoresAdjusted, authlogs)
Example #7
0
def parseDBLP(facultydict):
    """Tally publications by known faculty from a local ``dblp.xml``.

    Streams the file with ``ElementTree.iterparse`` and inspects every
    ``inproceedings`` / ``article`` element.  A paper is counted when:

    * its ``booktitle`` / ``journal`` text is a key of ``confdict``,
    * its ``year`` lies within ``[startyear, endyear]``,
    * at least one of its authors is in ``facultydict``, and
    * it is not rejected by the page-count filter.

    Args:
        facultydict: container of author names; only membership (``in``)
            is tested.

    Returns:
        A tuple ``(interestingauthors, authorscores, authorscoresAdjusted,
        authlogs)`` where

        * ``interestingauthors`` maps author name -> number of counted papers,
        * ``authorscores`` maps ``(author, area, year)`` -> paper count,
        * ``authorscoresAdjusted`` maps the same key -> sum of
          ``1 / authorsOnPaper`` over the counted papers,
        * ``authlogs`` maps author name -> list of ``"author ; venue year"``
          log strings.
    """
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}
    # Venues whose zero-page entries are still accepted (see below).
    exceptionVenues = ('SC', 'SIGSOFT FSE', 'PLDI', 'ACM Trans. Graph.')

    with open('dblp.xml', mode='r') as f:

        oldnode = None

        # NOTE(review): records are examined on both 'start' and 'end'
        # events.  The ElementTree documentation says children may be
        # incomplete at 'start' -- confirm this is acceptable.
        for (event, node) in ElementTree.iterparse(f, events=['start', 'end']):
            # Release the element seen on the previous event to bound memory.
            if oldnode is not None:
                oldnode.clear()
            oldnode = node

            if node.tag != 'inproceedings' and node.tag != 'article':
                continue

            # First, check if this is one of the conferences we are looking
            # for.  Only the first booktitle/journal child is considered.
            confname = ""
            foundArticle = False
            for child in node:
                if child.tag == 'booktitle' or child.tag == 'journal':
                    confname = child.text
                    foundArticle = confname in confdict
                    break

            if not foundArticle:
                # Not one of our conferences.
                continue

            # Check that dates are in the specified range.  A <year> element
            # without text is treated like a missing year instead of crashing
            # in int().
            year = -1
            inRange = False
            for child in node:
                if child.tag == 'year':
                    if child.text is not None:
                        year = int(child.text)
                        inRange = startyear <= year <= endyear
                    break

            if year == -1:
                # No year.
                print("NO YEAR WAT " + confname)
                continue

            # Count up how many authors are on this paper.  Authors without
            # text are skipped rather than crashing on None.strip().
            authorsOnPaper = 0
            foundOneInDict = False
            for child in node:
                if child.tag == 'author' and child.text is not None:
                    authorsOnPaper += 1
                    if child.text.strip() in facultydict:
                        foundOneInDict = True

            # Count the number of pages. It needs to exceed our threshold to
            # be considered.  pageCount stays -1 when there is no <pages>.
            pageCount = -1
            for child in node:
                if child.tag == 'pages':
                    pageCount = pagecount(child.text)

            tooFewPages = False
            if pageCount != -1 and pageCount < pageCountThreshold:
                tooFewPages = True
                # SPECIAL CASE for conferences that have incorrect entries
                # (as of 6/22/2016): a computed page count of 0 is accepted
                # for these venues.
                if pageCount == 0 and confname in exceptionVenues:
                    tooFewPages = False

            # NOTE(review): pageCount == -1 also satisfies this test, so ASE
            # papers without a page count are rejected -- confirm intended.
            if confname == 'ASE' and pageCount <= 6:
                tooFewPages = True

            if (not inRange) or (not foundOneInDict) or tooFewPages:
                continue

            # If we got here, we have a winner.
            areaname = confdict[confname]

            for child in node:
                if child.tag != 'author' or child.text is None:
                    continue
                authorName = child.text.strip()
                if authorName not in facultydict:
                    continue
                logstring = authorName.encode(
                    'utf-8') + " ; " + confname + " " + str(year)
                authlogs.setdefault(authorName, []).append(logstring)
                interestingauthors[authorName] = (
                    interestingauthors.get(authorName, 0) + 1)
                scoreKey = (authorName, areaname, year)
                authorscores[scoreKey] = authorscores.get(scoreKey, 0) + 1.0
                # Each author receives an equal fractional share of the paper.
                authorscoresAdjusted[scoreKey] = (
                    authorscoresAdjusted.get(scoreKey, 0)
                    + 1.0 / authorsOnPaper)

    return (interestingauthors, authorscores, authorscoresAdjusted, authlogs)
Example #8
0
def parseDBLP(facultydict):
    """Tally publications by known faculty from ``dblp.xml.gz``, all venues.

    Streams the gzip-compressed file with ``ElementTree.iterparse`` and
    inspects every ``inproceedings`` / ``article`` element.  Every venue is
    included; a venue that is not a key of ``csrankings.confdict`` is filed
    under the area ``"na"``.  A paper is counted when its year lies within
    ``[startyear, endyear]``, at least one author is in ``facultydict`` and
    it is not rejected by the page-count filter.

    Args:
        facultydict: container of author names; only membership (``in``)
            is tested.

    Returns:
        A tuple ``(interestingauthors, authorscores, authorscoresAdjusted,
        authlogs)`` where

        * ``interestingauthors`` maps author name -> number of counted papers,
        * ``authorscores`` maps ``(author, area, year)`` -> paper count,
        * ``authorscoresAdjusted`` maps the same key -> sum of
          ``1 / authorsOnPaper`` over the counted papers,
        * ``authlogs`` maps author name -> list of UTF-8 encoded
          ``"author ; venue year"`` log entries (``bytes``).
    """
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}

    with gzip.open("dblp.xml.gz") as f:

        oldnode = None

        # NOTE(review): records are examined on both "start" and "end"
        # events.  The ElementTree documentation says children may be
        # incomplete at "start" -- confirm this is acceptable.
        for (event, node) in ElementTree.iterparse(f, events=["start", "end"]):
            # Release the element seen on the previous event to bound memory.
            if oldnode is not None:
                oldnode.clear()
            oldnode = node

            if node.tag != "inproceedings" and node.tag != "article":
                continue

            authorsOnPaper = 0
            confname = ""
            year = -1
            pageCount = -1
            foundOneInDict = False
            volume = 0

            # Single pass over the record's children to collect its metadata.
            for child in node:
                tag = child.tag
                if tag == "booktitle" or tag == "journal":
                    confname = child.text
                elif tag == "volume":
                    volume = child.text
                elif tag == "year":
                    if child.text is not None:
                        year = int(child.text)
                elif tag == "pages":
                    pageCount = csrankings.pagecount(child.text)
                elif tag == "author":
                    if child.text is not None:
                        authorsOnPaper += 1
                        if child.text.strip() in facultydict:
                            foundOneInDict = True

            if confname is None:
                continue

            if confname in csrankings.confdict:
                areaname = csrankings.confdict[confname]
            else:
                areaname = "na"

            if year == -1:
                # No year.
                continue

            # Check that dates are in the specified range.
            inRange = startyear <= year <= endyear

            tooFewPages = False
            if pageCount != -1 and pageCount < csrankings.pageCountThreshold:
                # Short papers are dropped unless they come from one of the
                # exempted venue/year/volume combinations.
                exceptionConference = (
                    confname == "SC"
                    or (confname == "SIGSOFT FSE" and year == 2012)
                    or (
                        confname == "ACM Trans. Graph."
                        and 26 <= int(volume) <= 36
                    )
                )
                tooFewPages = not exceptionConference

            if (not inRange) or (not foundOneInDict) or tooFewPages:
                continue

            # If we got here, we have a winner.

            for child in node:
                # Authors without text were not counted above; skip them here
                # as well instead of crashing on None.strip().
                if child.tag != "author" or child.text is None:
                    continue
                authorName = child.text.strip()
                if authorName not in facultydict:
                    continue
                print(
                    "here we go "
                    + authorName
                    + " "
                    + confname
                    + " "
                    + str(authorsOnPaper)
                    + " "
                    + str(year)
                )
                logstring = (
                    authorName + " ; " + confname + " " + str(year)
                ).encode("utf-8")
                authlogs.setdefault(authorName, []).append(logstring)
                interestingauthors[authorName] = (
                    interestingauthors.get(authorName, 0) + 1
                )
                scoreKey = (authorName, areaname, year)
                authorscores[scoreKey] = authorscores.get(scoreKey, 0) + 1.0
                # Each author receives an equal fractional share of the paper.
                authorscoresAdjusted[scoreKey] = (
                    authorscoresAdjusted.get(scoreKey, 0)
                    + 1.0 / authorsOnPaper
                )

    return (interestingauthors, authorscores, authorscoresAdjusted, authlogs)
def parseDBLP():
    """Collect every counted paper and per-author scores from DBLP.

    Streams ``generated/dblp/dblp.xml.gz`` with ``ElementTree.iterparse``
    and inspects every ``inproceedings`` / ``article`` element.  A paper is
    kept when one of its ``booktitle`` / ``journal`` values is a key of
    ``confdict`` and ``countPaper(...)`` accepts it.  All authors of a kept
    paper are scored (there is no faculty filter here).

    Returns:
        A tuple ``(allpapers, interestingauthors, authorscores,
        authorscoresAdjusted, authlogs)`` where

        * ``allpapers`` is a list of per-paper dicts with the keys
          ``authors`` and ``dblp`` plus, when present in the record,
          ``area``, ``venue``, ``title``, ``year`` and ``url``,
        * ``interestingauthors`` maps author name -> number of kept papers,
        * ``authorscores`` maps ``(author, area, year)`` -> paper count,
        * ``authorscoresAdjusted`` maps the same key -> sum of
          ``1 / authorsOnPaper`` over the kept papers,
        * ``authlogs`` maps author name -> list of ``"author ; venue year"``
          log strings.

    Raises:
        Exception: if a kept paper ends up without a title.
    """
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}
    allpapers = []

    with gzip.open('generated/dblp/dblp.xml.gz', mode='r') as f:

        oldnode = None

        # NOTE(review): records are examined on both 'start' and 'end'
        # events.  The ElementTree documentation says children may be
        # incomplete at 'start' -- confirm this is acceptable.
        for (event, node) in ElementTree.iterparse(f, events=['start', 'end']):
            # Release the element seen on the previous event to bound memory.
            if oldnode is not None:
                oldnode.clear()
            oldnode = node

            if node.tag != 'inproceedings' and node.tag != 'article':
                continue

            foundArticle = False
            authorsOnPaper = 0
            confname = ""
            year = -1
            pageCount = -1
            startPage = -1
            number = 0
            volume = 0
            paperinfo = {'authors': [], 'dblp': node.get("key")}

            # First, check if this is one of the conferences we are looking
            # for, gathering the rest of the record's metadata on the way.
            for child in node:
                tag = child.tag
                if tag == 'booktitle' or tag == 'journal':
                    confname = child.text
                    if confname in confdict:
                        areaname = confdict[confname]
                        paperinfo['area'] = areaname
                        foundArticle = True
                    if confname in conf2confdict:
                        paperinfo['venue'] = conf2confdict[confname]
                    else:
                        paperinfo['venue'] = confname
                elif tag == 'volume':
                    volume = child.text
                elif tag == 'number':
                    number = child.text
                elif tag == 'title':
                    # this way instead of child.text as the latter breaks if
                    # the title contains HTML; this way strips all XML/HTML
                    # tags from within the title
                    # NOTE(review): the text is encoded as UTF-8 but decoded
                    # as latin1, which alters non-ASCII titles -- confirm
                    # that consumers of 'title' expect this.
                    rawTitle = ElementTree.tostring(
                        child, method="text", encoding="utf-8")
                    paperinfo['title'] = rawTitle.strip(" \n\t.").decode('latin1')
                elif tag == 'year':
                    if child.text is not None:
                        year = int(child.text)
                        paperinfo['year'] = year
                elif tag == 'pages':
                    pageCount = pagecount(child.text)
                    startPage = startpage(child.text)
                elif tag == 'url':
                    # sometimes this is None, even when there is clearly
                    #  a URL in the xml file. I cannot replicate this on a
                    #  small example, so I have no idea what is going on
                    paperinfo["url"] = child.text
                elif tag == 'author':
                    if child.text is not None:
                        paperinfo['authors'].append(unicode(child.text).strip())
                        authorsOnPaper += 1

            # One of our conferences?
            if not foundArticle:
                continue

            # One of the papers we count?
            if not countPaper(confname, year, volume, number, startPage, pageCount):
                continue

            # sanity check for errors where no title shows up
            #  (detects any recurrences of a bug where titles weren't
            #  included if they contained XML)
            if not paperinfo.get('title', False):
                # dump() writes the element to stdout itself and returns
                # None, so its result must not be printed.
                ElementTree.dump(node)
                print(paperinfo)
                raise Exception("No title")

            # If we get here, we have a winner.

            for child in node:
                # Authors without text were not counted above; skip them here
                # as well instead of crashing on None.strip().
                if child.tag != 'author' or child.text is None:
                    continue
                authorName = child.text.strip()
                logstring = authorName.encode('utf-8') + " ; " + confname + " " + str(year)
                authlogs.setdefault(authorName, []).append(logstring)
                interestingauthors[authorName] = interestingauthors.get(authorName, 0) + 1
                scoreKey = (authorName, areaname, year)
                authorscores[scoreKey] = authorscores.get(scoreKey, 0) + 1.0
                # Each author receives an equal fractional share of the paper.
                authorscoresAdjusted[scoreKey] = (
                    authorscoresAdjusted.get(scoreKey, 0) + 1.0 / authorsOnPaper)

            # record all paper info for logging
            allpapers.append(paperinfo)

    return (allpapers, interestingauthors, authorscores, authorscoresAdjusted, authlogs)