Python pagecount Examples

Programming Language: Python

Namespace/Package Name: csrankings

Method/Function: pagecount

Examples at hotexamples.com: 9

Python pagecount - 9 examples found. These are the top-rated real-world Python examples of csrankings.pagecount, extracted from open-source projects. You can rate examples to help us improve their quality.

Example #1

Show file

def handle_article(_: Any, article: ArticleType) -> bool:  # type: ignore
    """Process one parsed publication record and credit its authors.

    The record is skipped (the function returns early) when it has no
    ``author`` field, when ``args.all`` is unset and none of its authors can
    be found in ``facultydict`` (directly, through ``aliasdict`` or through
    ``reversealiasdict``), when it has neither ``booktitle`` nor ``journal``,
    or when its venue does not contain ``args.conference`` or is missing from
    ``confdict``.  For the remaining records ``countPaper`` decides whether
    the paper counts; if it does, ``totalPapers`` is incremented and every
    author with a non-empty affiliation (or every author when ``args.all`` is
    set) gets a log entry in ``authlogs`` plus increments in
    ``interestingauthors``, ``authorscores`` and ``authorscoresAdjusted``.

    Args:
        _: Unused first callback argument.
        article: Mapping holding the record's fields (``author``, ``title``,
            ``booktitle``/``journal``, ``year``, ``volume``, ``number``,
            ``pages``, ``url``).

    Returns:
        Always ``True``.

    Raises:
        Any exception raised while extracting fields is re-raised.  Anything
        other than ``TypeError`` is first printed and counted in ``failures``.
    """
    global counter
    global successes
    global failures
    global totalPapers
    counter += 1
    try:
        if counter % 10000 == 0:
            print(f"{counter} papers processed.")
        if "author" not in article:
            return True
        # A single author is not wrapped in a list; normalise every shape
        # to a list.
        authorList: List[str] = []
        rawAuthors = article["author"]
        if isinstance(rawAuthors, list):
            authorList = rawAuthors
        elif isinstance(rawAuthors, str):
            authorList = [str(rawAuthors)]
        elif isinstance(rawAuthors, dict):
            # OrderedDict is a dict subclass, so this covers both.
            authorList = [rawAuthors["#text"]]  # type: ignore
        else:
            print("***Unknown record type, skipping.***")
            return True
        authorsOnPaper = len(authorList)
        if not args.all:
            foundOneInDict = False
            for authorName in authorList:
                if isinstance(authorName, dict):
                    aName = authorName["#text"]  # type: ignore
                else:
                    aName = authorName
                aName = aName.strip()
                if aName in facultydict:
                    foundOneInDict = True
                    break
                # Check the two alias tables independently: a name missing
                # from aliasdict must not prevent the reversealiasdict check.
                if aName in aliasdict and aliasdict[aName] in facultydict:
                    foundOneInDict = True
                    break
                if (aName in reversealiasdict
                        and reversealiasdict[aName] in facultydict):
                    foundOneInDict = True
                    break
            if not foundOneInDict:
                return True
        if "booktitle" in article:
            confname = Conference(article["booktitle"])
        elif "journal" in article:
            confname = Conference(article["journal"])
        else:
            return True

        if args.conference not in confname:
            return True

        if confname not in confdict:
            return True

        volume = article.get("volume", "0")
        number = article.get("number", "0")
        url = article.get("url", "")
        year = int(article.get("year", "-1"))
        pages = ""

        areaname = confdict[confname]
        # Special handling for PACMPL: the venue is taken from the "number"
        # field instead.
        if areaname == Area("pacmpl"):
            confname = Conference(article["number"])
            if confname in confdict:
                areaname = confdict[confname]
            else:
                return True
        elif confname == Conference("ACM Trans. Graph."):
            # Issues listed in the TOG tables are re-attributed to
            # SIGGRAPH / SIGGRAPH Asia.
            if year in TOG_SIGGRAPH_Volume:
                (vol, num) = TOG_SIGGRAPH_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    confname = Conference("SIGGRAPH")
                    areaname = confdict[confname]
            if year in TOG_SIGGRAPH_Asia_Volume:
                (vol, num) = TOG_SIGGRAPH_Asia_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    confname = Conference("SIGGRAPH Asia")
                    areaname = confdict[confname]
        elif confname == "IEEE Trans. Vis. Comput. Graph.":
            # Issues listed in the TVCG tables are re-attributed to the
            # "vis" / "vr" areas.
            if year in TVCG_Vis_Volume:
                (vol, num) = TVCG_Vis_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    areaname = Area("vis")
            if year in TVCG_VR_Volume:
                (vol, num) = TVCG_VR_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    confname = Conference("VR")
                    areaname = Area("vr")

        # Default to an empty title so the name is always bound, even for
        # records that carry no "title" field.
        title = Title("")
        if "title" in article:
            rawTitle = article["title"]
            if isinstance(rawTitle, dict):
                title = Title(rawTitle["#text"])  # type: ignore
            else:
                title = Title(rawTitle)

        if "pages" in article:
            pages = article["pages"]
            pageCount = pagecount(pages)
            startPage = startpage(pages)
        else:
            pageCount = -1
            startPage = -1
        successes += 1
    except TypeError:
        raise
    except BaseException:
        print(sys.exc_info()[0])
        failures += 1
        raise

    if countPaper(confname, year, volume, number, pages, startPage, pageCount,
                  url, title):
        totalPapers += 1
        for authorName in authorList:
            aName = ""
            if isinstance(authorName, dict):
                aName = authorName["#text"]  # type: ignore
            elif isinstance(authorName, str):
                aName = authorName
            realName = aliasdict.get(aName, aName)
            affiliation = ""
            if realName in facultydict:
                affiliation = facultydict[realName]
            elif realName in aliasdict:
                # A dangling alias yields "no affiliation" instead of a
                # KeyError.
                affiliation = facultydict.get(aliasdict[realName], "")
            elif realName in reversealiasdict:
                affiliation = facultydict.get(reversealiasdict[realName], "")
            # Record the resolved (possibly empty) affiliation under the
            # canonical name.
            facultydict[realName] = affiliation

            # realName was just stored in facultydict, so only the
            # affiliation (or args.all) decides whether the author is logged.
            if affiliation or args.all:
                log: LogType = {
                    "name": realName.encode("utf-8"),
                    "year": year,
                    "title": title.encode("utf-8"),
                    "conf": confname,
                    "area": areaname,
                    "institution": affiliation,
                    "numauthors": authorsOnPaper,
                    "volume": volume,
                    "number": number,
                    "startPage": startPage,
                    "pageCount": pageCount,
                }
                tmplist: List[LogType] = authlogs.get(realName, [])
                tmplist.append(log)
                authlogs[realName] = tmplist
                interestingauthors[realName] += 1
                authorscores[(realName, areaname, year)] += 1.0
                authorscoresAdjusted[(realName, areaname,
                                      year)] += 1.0 / authorsOnPaper
    return True

Example #2

Show file

def parseDBLP(facultydict):
    """Scan ``dblp.xml`` and tally publications by authors in *facultydict*.

    Every ``inproceedings`` / ``article`` element whose venue is a key of
    ``confdict`` is examined.  A paper is credited when its year lies in
    ``[startyear, endyear]``, at least one author is a key of *facultydict*,
    and it passes the venue-specific filters below (Bioinformatics volume
    and number, ICSE start page, page-count threshold, ASE page count).

    Args:
        facultydict: Mapping whose keys are the author names to credit.

    Returns:
        A tuple ``(interestingauthors, authorscores, authorscoresAdjusted,
        authlogs)``: papers per author, papers per ``(author, area, year)``,
        the same weighted by ``1 / number of authors``, and a list of log
        strings per author.

    NOTE(review): the log strings concatenate ``.encode('utf-8')`` output
    with text, which only works with Python 2 string semantics.
    """
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}

    with open('dblp.xml', mode='r') as f:

        oldnode = None

        # The loop body runs for both the 'start' and the 'end' event of
        # every element.
        # NOTE(review): an element is cleared on the event after it was
        # seen, so which event still sees the children depends on parser
        # buffering - confirm.
        for (event, node) in ElementTree.iterparse(f, events=['start', 'end']):
            if oldnode is not None:
                oldnode.clear()
            oldnode = node

            if node.tag != 'inproceedings' and node.tag != 'article':
                continue

            foundArticle = False
            inRange = False
            authorsOnPaper = 0
            authorNames = []
            confname = ""
            year = -1
            pageCount = -1
            startPage = -1
            foundOneInDict = False
            number = 0
            volume = 0

            # First, collect the fields and check whether this is one of
            # the conferences we are looking for.
            for child in node:
                tag = child.tag
                if tag == 'booktitle' or tag == 'journal':
                    confname = child.text
                    if confname in confdict:
                        foundArticle = True
                elif tag == 'volume':
                    volume = child.text
                elif tag == 'number':
                    number = child.text
                elif tag == 'year':
                    if child.text is not None:
                        year = int(child.text)
                elif tag == 'pages':
                    pageCount = pagecount(child.text)
                    startPage = startpage(child.text)
                elif tag == 'author':
                    # Authors without text are neither counted nor credited.
                    if child.text is not None:
                        authorName = child.text.strip()
                        authorNames.append(authorName)
                        authorsOnPaper += 1
                        if authorName in facultydict:
                            foundOneInDict = True

            if not foundArticle:
                # Not one of our conferences.
                continue

            areaname = confdict[confname]

            # Special handling for ISMB: only the listed volume/number of
            # Bioinformatics is kept for each year.
            if confname == 'Bioinformatics':
                if year not in ISMB_Bioinformatics:
                    continue
                (vol, num) = ISMB_Bioinformatics[year]
                if (volume != str(vol)) or (number != str(num)):
                    continue

            # Special handling for ICSE.
            if confname in ('ICSE', 'ICSE (1)', 'ICSE (2)'):
                if year in ICSE_ShortPaperStart:
                    pageno = ICSE_ShortPaperStart[year]
                    if startPage >= pageno:
                        # Omit papers that start at or beyond this page,
                        # since they are "short papers" (regardless of
                        # their length).
                        continue

            # Check that dates are in the specified range.
            if (year >= startyear) and (year <= endyear):
                inRange = True

            if year == -1:
                # No year.  A single pre-formatted argument prints the same
                # text on Python 2 and keeps the line valid on Python 3.
                print("NO YEAR WAT %s" % (confname,))
                continue

            tooFewPages = False
            if (pageCount != -1) and (pageCount < pageCountThreshold):
                tooFewPages = True
                exceptionConference = confname in (
                    'SC', 'SIGSOFT FSE', 'PLDI', 'ACM Trans. Graph.')
                # SPECIAL CASE FOR conferences that have incorrect entries
                # (as of 6/22/2016): a page count of 0 at these venues is
                # not treated as "too short".  -1 means no pages were found
                # at all.
                if (pageCount == 0) and exceptionConference:
                    tooFewPages = False

            if (confname == 'ASE') and (pageCount <= 6):
                tooFewPages = True

            if (not inRange) or (not foundOneInDict) or tooFewPages:
                continue

            # If we got here, we have a winner.
            for authorName in authorNames:
                if authorName not in facultydict:
                    continue
                logstring = authorName.encode(
                    'utf-8') + " ; " + confname + " " + str(year)
                authlogs.setdefault(authorName, []).append(logstring)
                interestingauthors[authorName] = interestingauthors.get(
                    authorName, 0) + 1
                key = (authorName, areaname, year)
                authorscores[key] = authorscores.get(key, 0) + 1.0
                authorscoresAdjusted[key] = authorscoresAdjusted.get(
                    key, 0) + 1.0 / authorsOnPaper

    return (interestingauthors, authorscores, authorscoresAdjusted, authlogs)

Example #3

Show file

File: regenerate_data.py Project: fredsala/CSrankings

def handle_article(_ : Any, article : ArticleType) -> bool: # type: ignore
    """Process one parsed publication record and credit its authors.

    The record is skipped (the function returns early) when it has no
    ``author`` field, when ``args.all`` is unset and none of its authors can
    be found in ``facultydict`` (directly, through ``aliasdict`` or through
    ``reversealiasdict``), when it has neither ``booktitle`` nor ``journal``,
    or when its venue does not contain ``args.conference`` or is missing from
    ``confdict``.  For the remaining records ``countPaper`` decides whether
    the paper counts; if it does, ``totalPapers`` is incremented and every
    author with a non-empty affiliation (or every author when ``args.all`` is
    set) gets a log entry in ``authlogs`` plus increments in
    ``interestingauthors``, ``authorscores`` and ``authorscoresAdjusted``.

    Args:
        _: Unused first callback argument.
        article: Mapping holding the record's fields (``author``, ``title``,
            ``booktitle``/``journal``, ``year``, ``volume``, ``number``,
            ``pages``, ``url``).

    Returns:
        Always ``True``.

    Raises:
        Any exception raised while extracting fields is re-raised.  Anything
        other than ``TypeError`` is first printed and counted in ``failures``.
    """
    global counter
    global successes
    global failures
    global totalPapers
    counter += 1
    try:
        if counter % 10000 == 0:
            print(str(counter)+ " papers processed.")
        if 'author' not in article:
            return True
        # A single author is not wrapped in a list; normalise every shape
        # to a list.
        authorList : List[str] = []
        rawAuthors = article['author']
        if isinstance(rawAuthors, list):
            authorList = rawAuthors
        elif isinstance(rawAuthors, str):
            authorList = [str(rawAuthors)]
        elif isinstance(rawAuthors, dict):
            # Accept plain dicts as well as OrderedDict (a dict subclass).
            authorList = [rawAuthors["#text"]] # type: ignore
        else:
            print("***Unknown record type, skipping.***")
            return True
        authorsOnPaper = len(authorList)
        if not args.all:
            foundOneInDict = False
            for authorName in authorList:
                if isinstance(authorName, dict):
                    aName = authorName["#text"] # type: ignore
                else:
                    aName = authorName
                aName = aName.strip()
                if aName in facultydict:
                    foundOneInDict = True
                    break
                # Check the two alias tables independently, without a
                # blanket exception handler: a name missing from aliasdict
                # must not prevent the reversealiasdict check.
                if aName in aliasdict and aliasdict[aName] in facultydict:
                    foundOneInDict = True
                    break
                if aName in reversealiasdict and reversealiasdict[aName] in facultydict:
                    foundOneInDict = True
                    break

            if not foundOneInDict:
                return True
        if 'booktitle' in article:
            confname = Conference(article['booktitle'])
        elif 'journal' in article:
            confname = Conference(article['journal'])
        else:
            return True

        if args.conference not in confname:
            return True

        if confname not in confdict:
            return True

        volume = article.get('volume',"0")
        number = article.get('number',"0")
        url    = article.get('url',"")
        year   = int(article.get('year',"-1"))
        pages  = ""

        areaname = confdict[confname]
        # Special handling for PACMPL: the venue is taken from the 'number'
        # field instead.
        if areaname == Area('pacmpl'):
            confname = Conference(article['number'])
            if confname in confdict:
                areaname = confdict[confname]
            else:
                return True
        elif confname == Conference('ACM Trans. Graph.'):
            # Issues listed in the TOG tables are re-attributed to
            # SIGGRAPH / SIGGRAPH Asia.
            if year in TOG_SIGGRAPH_Volume:
                (vol, num) = TOG_SIGGRAPH_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    confname = Conference('SIGGRAPH')
                    areaname = confdict[confname]
            if year in TOG_SIGGRAPH_Asia_Volume:
                (vol, num) = TOG_SIGGRAPH_Asia_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    confname = Conference('SIGGRAPH Asia')
                    areaname = confdict[confname]
        elif confname == 'IEEE Trans. Vis. Comput. Graph.':
            # Issues listed in the TVCG tables are re-attributed to the
            # 'vis' / 'vr' areas.
            if year in TVCG_Vis_Volume:
                (vol, num) = TVCG_Vis_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    areaname = Area('vis')
            if year in TVCG_VR_Volume:
                (vol, num) = TVCG_VR_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    confname = Conference('VR')
                    areaname = Area('vr')

        # Default to an empty title so the name is always bound, even for
        # records that carry no 'title' field.
        title = Title("")
        if 'title' in article:
            rawTitle = article['title']
            if isinstance(rawTitle, dict):
                title = Title(rawTitle["#text"]) # type: ignore
            else:
                title = Title(rawTitle)

        if 'pages' in article:
            pages = article['pages']
            pageCount = pagecount(pages)
            startPage = startpage(pages)
        else:
            pageCount = -1
            startPage = -1
        successes += 1
    except TypeError:
        raise
    except BaseException:
        # Same coverage as a bare except; the error is always re-raised.
        print(sys.exc_info()[0])
        failures += 1
        raise

    if countPaper(confname, year, volume, number, pages, startPage, pageCount, url, title):
        totalPapers += 1
        for authorName in authorList:
            aName = ""
            if isinstance(authorName, dict):
                aName = authorName["#text"] # type: ignore
            elif isinstance(authorName, str):
                aName = authorName
            realName = aliasdict.get(aName, aName)
            affiliation = ""
            if realName in facultydict:
                affiliation = facultydict[realName]
            elif realName in aliasdict:
                # A dangling alias yields "no affiliation" instead of a
                # KeyError.
                affiliation = facultydict.get(aliasdict[realName], "")
            elif realName in reversealiasdict:
                affiliation = facultydict.get(reversealiasdict[realName], "")
            # Record the resolved (possibly empty) affiliation under the
            # canonical name.
            facultydict[realName] = affiliation

            # realName was just stored in facultydict, so only the
            # affiliation (or args.all) decides whether the author is logged.
            if affiliation or args.all:
                log : LogType = { 'name' : realName.encode('utf-8'),
                                  'year' : year,
                                  'title' : title.encode('utf-8'),
                                  'conf' : confname,
                                  'area' : areaname,
                                  'institution' : affiliation,
                                  'numauthors' : authorsOnPaper,
                                  'volume' : volume,
                                  'number' : number,
                                  'startPage' : startPage,
                                  'pageCount' : pageCount }
                tmplist : List[LogType] = authlogs.get(realName, [])
                tmplist.append(log)
                authlogs[realName] = tmplist
                interestingauthors[realName] += 1
                authorscores[(realName, areaname, year)] += 1.0
                authorscoresAdjusted[(realName, areaname, year)] += 1.0 / authorsOnPaper
    return True

Example #4

Show file

File: regenerate-data.py Project: saadmahboob/CSrankings

def parseDBLP(facultydict):
    """Scan ``dblp.xml.gz`` and tally publications by authors in *facultydict*.

    Every ``inproceedings`` / ``article`` element is examined.  A paper is
    credited when its venue is a key of ``confdict``, at least one author is
    a key of *facultydict*, and ``countPaper`` accepts it.

    Args:
        facultydict: Mapping from author name to institution; only its keys
            are credited.

    Returns:
        A tuple ``(interestingauthors, authorscores, authorscoresAdjusted,
        authlogs)``: papers per author, papers per ``(author, area, year)``,
        the same weighted by ``1 / number of authors``, and a list of log
        dicts per author.
    """
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}

    # Reads the gzip-compressed dump; open('dblp.xml', mode='rb') is the
    # uncompressed alternative.
    with gzip.open('dblp.xml.gz') as f:

        oldnode = None

        # NOTE(review): dtd is never referenced again in this function; the
        # call is kept because constructing it may still read dblp.dtd.
        dtd = ElementTree.DTD(file='dblp.dtd')
        # The loop body runs for both the 'start' and the 'end' event of
        # every element.
        for (event, node) in ElementTree.iterparse(f, events=['start', 'end'], load_dtd=True):
            if oldnode is not None:
                oldnode.clear()
            oldnode = node

            if node.tag != 'inproceedings' and node.tag != 'article':
                continue

            foundArticle = False
            authorsOnPaper = 0
            authorList = []
            confname = ""
            title = ""
            # Reset per record so a paper without <url> neither hits an
            # unbound name nor inherits the previous paper's URL.
            url = ""
            year = -1
            pageCount = -1
            startPage = -1
            foundOneInDict = False
            number = 0
            volume = 0

            for child in node:
                tag = child.tag
                if tag == 'booktitle' or tag == 'journal':
                    if child.text is not None:
                        confname = child.text
                        if confname in confdict:
                            areaname = confdict[confname]
                            foundArticle = True
                elif tag == 'title':
                    if child.text is not None:
                        title = child.text
                elif tag == 'volume':
                    volume = child.text
                elif tag == 'number':
                    number = child.text
                elif tag == 'year':
                    if child.text is not None:
                        year = int(child.text)
                elif tag == 'pages':
                    pageCount = pagecount(child.text)
                    startPage = startpage(child.text)
                elif tag == 'url':
                    url = child.text
                elif tag == 'author':
                    if child.text is not None:
                        authorName = child.text.strip()
                        authorList.append(authorName)
                        authorsOnPaper += 1
                        if authorName in facultydict:
                            foundOneInDict = True

            # One of our conferences?
            if not foundArticle:
                continue

            # Any authors in our affiliations?
            if not foundOneInDict:
                continue

            # One of the papers we count?
            if not countPaper(confname, year, volume, number, startPage, pageCount, url):
                continue

            # If we get here, we have a winner.
            for authorName in authorList:
                if authorName not in facultydict:
                    continue
                logstring = { 'name' : authorName.encode('utf-8'),
                              'conf' : confname,
                              'area' : areaname,
                              'year' : year,
                              'title' : title.encode('utf-8'),
                              'institution' : facultydict[authorName] }
                authlogs.setdefault(authorName, []).append(logstring)
                interestingauthors[authorName] = interestingauthors.get(authorName, 0) + 1
                key = (authorName, areaname, year)
                authorscores[key] = authorscores.get(key, 0) + 1.0
                authorscoresAdjusted[key] = authorscoresAdjusted.get(key, 0) + 1.0 / authorsOnPaper

    return (interestingauthors, authorscores, authorscoresAdjusted, authlogs)

Example #5

Show file

def parseDBLP(facultydict):
    """Scan ``dblp.xml`` and tally publications by authors in *facultydict*.

    Every ``inproceedings`` / ``article`` element is examined.  A paper is
    credited when at least one author is a key of *facultydict*, its venue
    is a key of ``confdict``, and ``countPaper`` accepts it.

    Args:
        facultydict: Mapping whose keys are the author names to credit.

    Returns:
        A tuple ``(interestingauthors, authorscores, authorscoresAdjusted,
        authlogs)``: papers per author, papers per ``(author, area, year)``,
        the same weighted by ``1 / number of authors``, and a list of log
        strings per author.

    NOTE(review): the log strings concatenate ``.encode('utf-8')`` output
    with text, which only works with Python 2 string semantics.
    """
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}

    with open('dblp.xml', mode='r') as f:

        oldnode = None

        # The loop body runs for both the 'start' and the 'end' event of
        # every element.
        for (event, node) in ElementTree.iterparse(f, events=['start', 'end']):
            if oldnode is not None:
                oldnode.clear()
            oldnode = node

            if node.tag != 'inproceedings' and node.tag != 'article':
                continue

            foundArticle = False
            authorsOnPaper = 0
            authorNames = []
            confname = ""
            title = ""
            year = -1
            pageCount = -1
            startPage = -1
            foundOneInDict = False
            number = 0
            volume = 0

            # Collect the fields and check whether this is one of the
            # conferences we are looking for.
            for child in node:
                tag = child.tag
                if tag == 'booktitle' or tag == 'journal':
                    confname = child.text
                    if confname in confdict:
                        areaname = confdict[confname]
                        foundArticle = True
                elif tag == 'title':
                    if child.text is not None:
                        title = child.text
                elif tag == 'volume':
                    volume = child.text
                elif tag == 'number':
                    number = child.text
                elif tag == 'year':
                    if child.text is not None:
                        year = int(child.text)
                elif tag == 'pages':
                    pageCount = pagecount(child.text)
                    startPage = startpage(child.text)
                elif tag == 'author':
                    # Authors without text are neither counted nor credited;
                    # calling .strip() on them would raise.
                    if child.text is not None:
                        authorName = child.text.strip()
                        authorNames.append(authorName)
                        authorsOnPaper += 1
                        if authorName in facultydict:
                            foundOneInDict = True

            # Any authors in our affiliations?
            if not foundOneInDict:
                continue

            # One of our conferences?
            if not foundArticle:
                continue

            # One of the papers we count?
            if not countPaper(confname, year, volume, number, startPage,
                              pageCount):
                continue

            # If we get here, we have a winner.
            for authorName in authorNames:
                if authorName not in facultydict:
                    continue
                logstring = authorName.encode(
                    'utf-8') + " ; " + confname + " " + str(
                        year) + ": " + title.encode('utf-8')
                authlogs.setdefault(authorName, []).append(logstring)
                interestingauthors[authorName] = interestingauthors.get(
                    authorName, 0) + 1
                key = (authorName, areaname, year)
                authorscores[key] = authorscores.get(key, 0) + 1.0
                authorscoresAdjusted[key] = authorscoresAdjusted.get(
                    key, 0) + 1.0 / authorsOnPaper

    return (interestingauthors, authorscores, authorscoresAdjusted, authlogs)

Example #6

Show file

File: generate-all-pubs.py Project: msirivia/CSrankings-Cyprus-University-of-Technology

def parseDBLP(facultydict):
    """Scan ``dblp.xml.gz`` and tally publications by authors in *facultydict*.

    Every ``inproceedings`` / ``article`` element is examined, whatever its
    venue; venues that are not keys of ``confdict`` are filed under the area
    ``"na"``.  A paper is credited when it has a year inside
    ``[startyear, endyear]``, at least one author is a key of *facultydict*,
    and it is not rejected by the page-count threshold.

    Args:
        facultydict: Mapping whose keys are the author names to credit.

    Returns:
        A tuple ``(interestingauthors, authorscores, authorscoresAdjusted,
        authlogs)``: papers per author, papers per ``(author, area, year)``,
        the same weighted by ``1 / number of authors``, and a list of log
        strings per author.

    NOTE(review): the log strings concatenate ``.encode('utf-8')`` output
    with text, which only works with Python 2 string semantics.
    """
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}

    # Reads the gzip-compressed dump; open('dblp.xml', mode='r') is the
    # uncompressed alternative.
    with gzip.open('dblp.xml.gz') as f:

        oldnode = None

        # The loop body runs for both the 'start' and the 'end' event of
        # every element.
        for (event, node) in ElementTree.iterparse(f, events=['start', 'end']):
            if oldnode is not None:
                oldnode.clear()
            oldnode = node

            if node.tag != 'inproceedings' and node.tag != 'article':
                continue

            inRange = False
            authorsOnPaper = 0
            authorNames = []
            confname = ""
            year = -1
            pageCount = -1
            startPage = -1
            foundOneInDict = False
            number = 0
            volume = 0

            # Collect the fields.  All venues are included, so there is no
            # venue filter here.
            for child in node:
                tag = child.tag
                if tag == 'booktitle' or tag == 'journal':
                    confname = child.text
                elif tag == 'volume':
                    volume = child.text
                elif tag == 'number':
                    number = child.text
                elif tag == 'year':
                    if child.text is not None:
                        year = int(child.text)
                elif tag == 'pages':
                    pageCount = pagecount(child.text)
                    startPage = startpage(child.text)
                elif tag == 'author':
                    # Authors without text are neither counted nor credited;
                    # calling .strip() on them would raise.
                    if child.text is not None:
                        authorName = child.text.strip()
                        authorNames.append(authorName)
                        authorsOnPaper += 1
                        if authorName in facultydict:
                            foundOneInDict = True

            if confname is None:
                continue

            if confname not in confdict:
                areaname = "na"
            else:
                areaname = confdict[confname]

            # Check that dates are in the specified range.
            if (year >= startyear) and (year <= endyear):
                inRange = True

            if year == -1:
                # No year.
                continue

            tooFewPages = False
            if (pageCount != -1) and (pageCount < pageCountThreshold):
                tooFewPages = True
                # Venues exempt from the page-count threshold.
                exceptionConference = confname == 'SC'
                exceptionConference |= confname == 'SIGSOFT FSE' and year == 2012
                exceptionConference |= confname == 'ACM Trans. Graph.' and int(
                    volume) >= 26 and int(volume) <= 36
                if exceptionConference:
                    tooFewPages = False

            if (not inRange) or (not foundOneInDict) or tooFewPages:
                continue

            # If we got here, we have a winner.
            for authorName in authorNames:
                if authorName not in facultydict:
                    continue
                # A single pre-formatted argument prints the same text on
                # Python 2 and keeps the line valid on Python 3.
                print("here we go %s %s %s %s" % (
                    authorName, confname, authorsOnPaper, year))
                logstring = authorName.encode(
                    'utf-8') + " ; " + confname.encode(
                        'utf-8') + " " + str(year)
                authlogs.setdefault(authorName, []).append(logstring)
                interestingauthors[authorName] = interestingauthors.get(
                    authorName, 0) + 1
                key = (authorName, areaname, year)
                authorscores[key] = authorscores.get(key, 0) + 1.0
                authorscoresAdjusted[key] = authorscoresAdjusted.get(
                    key, 0) + 1.0 / authorsOnPaper

    return (interestingauthors, authorscores, authorscoresAdjusted, authlogs)

Example #7

Show file

def parseDBLP(facultydict):
    """Scan ``dblp.xml`` and tally publications by authors in *facultydict*.

    Every ``inproceedings``/``article`` element whose first
    ``booktitle``/``journal`` child names a key of ``confdict`` is
    considered. It is counted when its year lies within
    ``[startyear, endyear]``, at least one of its authors is in
    *facultydict*, and the page-count rules below do not reject it.

    ``confdict``, ``startyear``, ``endyear``, ``pageCountThreshold``,
    ``pagecount`` and ``ElementTree`` are resolved outside this function.

    Args:
        facultydict: container of author names; only membership tests
            (``name in facultydict``) are performed on it.

    Returns:
        A 4-tuple ``(interestingauthors, authorscores,
        authorscoresAdjusted, authlogs)`` where

        * ``interestingauthors`` maps author name -> number of counted papers,
        * ``authorscores`` maps ``(author, area, year)`` -> paper count,
        * ``authorscoresAdjusted`` maps ``(author, area, year)`` -> sum of
          ``1 / authorsOnPaper`` over the counted papers,
        * ``authlogs`` maps author name -> list of
          ``"<author> ; <venue> <year>"`` log strings.
    """
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}

    with open('dblp.xml', mode='r') as f:

        oldnode = None

        # NOTE(review): the body below runs for both 'start' and 'end'
        # events, and every element is cleared on the event that follows
        # it. On 'start' the children of an element are not guaranteed to
        # be present yet -- confirm this interplay is intended.
        for (event, node) in ElementTree.iterparse(f, events=['start', 'end']):
            # Drop the contents of the previously visited element.
            if oldnode is not None:
                oldnode.clear()
            oldnode = node

            if node.tag != 'inproceedings' and node.tag != 'article':
                continue

            # First, check if this is one of the conferences we are looking
            # for. Only the first booktitle/journal child is consulted.
            confname = ""
            foundArticle = False
            for child in node:
                if child.tag == 'booktitle' or child.tag == 'journal':
                    confname = child.text
                    foundArticle = confname in confdict
                    break

            if not foundArticle:
                # Not one of our conferences.
                continue

            # Check that dates are in the specified range. Only the first
            # year child is consulted; an empty one leaves year at -1.
            year = -1
            inRange = False
            for child in node:
                if child.tag == 'year':
                    if child.text is not None:
                        year = int(child.text)
                        inRange = startyear <= year <= endyear
                    break

            if year == -1:
                # No year. (Single-argument print parses identically on
                # Python 2 and Python 3 and emits the same text.)
                print("NO YEAR WAT %s" % confname)
                continue

            # Count up how many authors are on this paper. Author elements
            # without text are skipped instead of crashing on .strip().
            authorsOnPaper = 0
            foundOneInDict = False
            for child in node:
                if child.tag == 'author' and child.text is not None:
                    authorsOnPaper += 1
                    if child.text.strip() in facultydict:
                        foundOneInDict = True

            # Count the number of pages (the last pages child wins). It
            # needs to reach our threshold to be considered; -1 means no
            # page information was found at all and is never "too few".
            pageCount = -1
            for child in node:
                if child.tag == 'pages':
                    pageCount = pagecount(child.text)

            tooFewPages = False
            if pageCount != -1 and pageCount < pageCountThreshold:
                tooFewPages = True
                # Special case for venues that have incorrect DBLP entries
                # (original note dated 6/22/2016): real papers there show
                # up with bogus page counts, so a count of exactly 0 at
                # these venues is not treated as too short.
                exceptionConference = (confname == 'SC'
                                       or confname == 'SIGSOFT FSE'
                                       or confname == 'PLDI'
                                       or confname == 'ACM Trans. Graph.')
                if pageCount == 0 and exceptionConference:
                    tooFewPages = False

            # ASE papers of six pages or fewer are always rejected; this
            # also covers a pageCount of -1 (no pages element).
            if confname == 'ASE' and pageCount <= 6:
                tooFewPages = True

            areaname = confdict[confname]

            if (not inRange) or (not foundOneInDict) or tooFewPages:
                continue

            # If we got here, we have a winner.

            for child in node:
                if child.tag != 'author' or child.text is None:
                    continue
                authorName = child.text.strip()
                if authorName not in facultydict:
                    continue
                # NOTE(review): concatenating the encoded name with plain
                # string literals relies on Python 2 string semantics.
                logstring = authorName.encode(
                    'utf-8') + " ; " + confname + " " + str(year)
                authlogs.setdefault(authorName, []).append(logstring)
                interestingauthors[authorName] = (
                    interestingauthors.get(authorName, 0) + 1)
                key = (authorName, areaname, year)
                authorscores[key] = authorscores.get(key, 0) + 1.0
                authorscoresAdjusted[key] = (
                    authorscoresAdjusted.get(key, 0) + 1.0 / authorsOnPaper)

    return (interestingauthors, authorscores, authorscoresAdjusted, authlogs)

Example #8

Show file

def parseDBLP(facultydict):
    """Scan ``dblp.xml.gz`` and tally publications by authors in *facultydict*.

    All venues are included: every ``inproceedings``/``article`` element is
    considered, and a venue missing from ``csrankings.confdict`` is filed
    under the area ``"na"``. A record is counted when its year lies within
    ``[startyear, endyear]``, at least one of its authors is in
    *facultydict*, and it is not rejected for having too few pages.

    ``startyear``, ``endyear``, ``csrankings``, ``gzip`` and ``ElementTree``
    are resolved outside this function.

    Args:
        facultydict: container of author names; only membership tests
            (``name in facultydict``) are performed on it.

    Returns:
        A 4-tuple ``(interestingauthors, authorscores,
        authorscoresAdjusted, authlogs)`` where

        * ``interestingauthors`` maps author name -> number of counted papers,
        * ``authorscores`` maps ``(author, area, year)`` -> paper count,
        * ``authorscoresAdjusted`` maps ``(author, area, year)`` -> sum of
          ``1 / authorsOnPaper`` over the counted papers,
        * ``authlogs`` maps author name -> list of UTF-8 encoded
          ``"<author> ; <venue> <year>"`` log entries.
    """
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}

    with gzip.open("dblp.xml.gz") as f:

        oldnode = None

        # NOTE(review): the body below runs for both "start" and "end"
        # events, and every element is cleared on the event that follows
        # it. On "start" the children of an element are not guaranteed to
        # be present yet -- confirm this interplay is intended.
        for (event, node) in ElementTree.iterparse(f, events=["start", "end"]):
            # Drop the contents of the previously visited element.
            if oldnode is not None:
                oldnode.clear()
            oldnode = node

            if node.tag != "inproceedings" and node.tag != "article":
                continue

            authorsOnPaper = 0
            confname = ""
            year = -1
            pageCount = -1
            foundOneInDict = False
            volume = 0

            # Single pass over the record's fields. When a tag repeats,
            # the last occurrence wins (authors are accumulated instead).
            for child in node:
                if child.tag == "booktitle" or child.tag == "journal":
                    confname = child.text
                elif child.tag == "volume":
                    volume = child.text
                elif child.tag == "year":
                    if child.text is not None:
                        year = int(child.text)
                elif child.tag == "pages":
                    pageCount = csrankings.pagecount(child.text)
                elif child.tag == "author":
                    if child.text is not None:
                        authorsOnPaper += 1
                        if child.text.strip() in facultydict:
                            foundOneInDict = True

            # A venue element without text cannot be attributed.
            if confname is None:
                continue

            if confname in csrankings.confdict:
                areaname = csrankings.confdict[confname]
            else:
                areaname = "na"

            if year == -1:
                # No year.
                continue

            # Check that dates are in the specified range.
            inRange = startyear <= year <= endyear

            # A pageCount of -1 (no pages element seen) is never "too few".
            tooFewPages = False
            if pageCount != -1 and pageCount < csrankings.pageCountThreshold:
                # Venues exempted from the page threshold.
                exceptionConference = (
                    confname == "SC"
                    or (confname == "SIGSOFT FSE" and year == 2012)
                    or (
                        confname == "ACM Trans. Graph."
                        and 26 <= int(volume) <= 36
                    )
                )
                tooFewPages = not exceptionConference

            if (not inRange) or (not foundOneInDict) or tooFewPages:
                continue

            # If we got here, we have a winner.

            for child in node:
                # Mirror the counting pass: authors without text are
                # skipped rather than crashing on .strip().
                if child.tag != "author" or child.text is None:
                    continue
                authorName = child.text.strip()
                if authorName not in facultydict:
                    continue
                print(
                    "here we go"
                    + authorName
                    + " "
                    + confname
                    + " "
                    + str(authorsOnPaper)
                    + " "
                    + str(year)
                )
                logstring = (
                    authorName + " ; " + confname + " " + str(year)
                ).encode("utf-8")
                authlogs.setdefault(authorName, []).append(logstring)
                interestingauthors[authorName] = (
                    interestingauthors.get(authorName, 0) + 1
                )
                key = (authorName, areaname, year)
                authorscores[key] = authorscores.get(key, 0) + 1.0
                authorscoresAdjusted[key] = (
                    authorscoresAdjusted.get(key, 0) + 1.0 / authorsOnPaper
                )

    return (interestingauthors, authorscores, authorscoresAdjusted, authlogs)

Example #9

Show file

File: regenerate-data.py Project: lehaifeng/cs-arxiv-popularity-code

def parseDBLP():
    """Scan the gzipped DBLP dump and collect every counted paper.

    Each ``inproceedings``/``article`` element whose venue is a key of
    ``confdict`` and that ``countPaper`` accepts is recorded, and every one
    of its authors is credited (there is no faculty filter here).

    ``confdict``, ``conf2confdict``, ``pagecount``, ``startpage``,
    ``countPaper``, ``gzip`` and ``ElementTree`` are resolved outside this
    function.

    Returns:
        A 5-tuple ``(allpapers, interestingauthors, authorscores,
        authorscoresAdjusted, authlogs)`` where

        * ``allpapers`` is a list of per-paper dicts with the keys
          ``authors``, ``dblp``, ``area``, ``venue`` and, when the matching
          child element is present, ``title``, ``year`` and ``url``,
        * ``interestingauthors`` maps author name -> number of counted papers,
        * ``authorscores`` maps ``(author, area, year)`` -> paper count,
        * ``authorscoresAdjusted`` maps ``(author, area, year)`` -> sum of
          ``1 / authorsOnPaper`` over the counted papers,
        * ``authlogs`` maps author name -> list of
          ``"<author> ; <venue> <year>"`` log strings.

    Raises:
        Exception: if an accepted paper ends up without a title.
    """
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}
    allpapers = []

    with gzip.open('generated/dblp/dblp.xml.gz', mode='r') as f:

        oldnode = None

        # NOTE(review): the body below runs for both 'start' and 'end'
        # events, and every element is cleared on the event that follows
        # it. On 'start' the children of an element are not guaranteed to
        # be present yet -- confirm this interplay is intended.
        for (event, node) in ElementTree.iterparse(f, events=['start', 'end']):
            # Drop the contents of the previously visited element.
            if oldnode is not None:
                oldnode.clear()
            oldnode = node

            if node.tag != 'inproceedings' and node.tag != 'article':
                continue

            foundArticle = False
            authorsOnPaper = 0
            confname = ""
            year = -1
            pageCount = -1
            startPage = -1
            number = 0
            volume = 0
            paperinfo = {'authors': []}
            paperinfo["dblp"] = node.get("key")

            # Single pass over the record's fields; this also checks whether
            # the venue is one of the conferences we are looking for.
            for child in node:
                if child.tag == 'booktitle' or child.tag == 'journal':
                    confname = child.text
                    if confname in confdict:
                        areaname = confdict[confname]
                        paperinfo['area'] = areaname
                        foundArticle = True
                    if confname in conf2confdict:
                        paperinfo['venue'] = conf2confdict[confname]
                    else:
                        paperinfo['venue'] = confname
                elif child.tag == 'volume':
                    volume = child.text
                elif child.tag == 'number':
                    number = child.text
                elif child.tag == 'title':
                    # Serialise as text instead of reading child.text: the
                    # latter breaks if the title contains HTML, whereas this
                    # strips all XML/HTML tags from within the title.
                    paperinfo['title'] = ElementTree.tostring(
                        child, method="text", encoding="utf-8").strip(" \n\t.")
                    # NOTE(review): bytes produced as UTF-8 are decoded as
                    # latin1 here -- confirm downstream expects that.
                    paperinfo['title'] = paperinfo['title'].decode('latin1')
                elif child.tag == 'year':
                    if child.text is not None:
                        year = int(child.text)
                        paperinfo['year'] = year
                elif child.tag == 'pages':
                    pageCount = pagecount(child.text)
                    startPage = startpage(child.text)
                elif child.tag == 'url':
                    # Original author's note: sometimes this is None, even
                    # when there is clearly a URL in the xml file; it could
                    # not be reproduced on a small example.
                    paperinfo["url"] = child.text
                elif child.tag == 'author':
                    authorName = child.text
                    if authorName is not None:
                        authorName = unicode(authorName).strip()
                        paperinfo['authors'].append(authorName)
                        authorsOnPaper += 1

            # One of our conferences?
            if not foundArticle:
                continue

            # One of the papers we count?
            if not countPaper(confname, year, volume, number, startPage, pageCount):
                continue

            # Sanity check for errors where no title shows up (detects any
            # recurrence of a bug where titles containing XML were dropped).
            if not paperinfo.get('title', False):
                # dump() writes the element to stdout itself and returns
                # None, so it must not be wrapped in a print.
                ElementTree.dump(node)
                print(paperinfo)
                raise Exception("No title")

            # If we get here, we have a winner.

            for child in node:
                # Mirror the collection pass: authors without text are
                # skipped rather than crashing on .strip().
                if child.tag != 'author' or child.text is None:
                    continue
                authorName = child.text.strip()
                # NOTE(review): concatenating the encoded name with plain
                # string literals relies on Python 2 string semantics.
                logstring = authorName.encode('utf-8') + " ; " + confname + " " + str(year)
                authlogs.setdefault(authorName, []).append(logstring)
                interestingauthors[authorName] = interestingauthors.get(authorName, 0) + 1
                key = (authorName, areaname, year)
                authorscores[key] = authorscores.get(key, 0) + 1.0
                authorscoresAdjusted[key] = authorscoresAdjusted.get(key, 0) + 1.0 / authorsOnPaper

            # Record all paper info for logging.
            allpapers.append(paperinfo)

    return (allpapers, interestingauthors, authorscores, authorscoresAdjusted, authlogs)