Beispiel #1
0
def author_papers(papers, node_id='ayjid', paper_attribs=[], **kwargs):
    """
    Generate an author-papers network as a NetworkX directed graph.

    ==============     =========================================================
    Element            Description
    ==============     =========================================================
    Node               Two kinds of nodes with distinguishing "type" attributes:
                       * type = paper    - a paper in papers
                       * type = person   - a person in papers
                       Papers node attributes defined by paper_attribs.
    Edge               Directed, Author -> his/her Paper.
    Edge Attribute     date: the 'date' value of the paper.
    ==============     =========================================================

    Parameters
    ----------
    papers : list
        A list of :class:`.Paper` instances. Each entry is read through the
        keys node_id, 'aulast', 'auinit' and 'date'.
    node_id : string
        A key from :class:`.Paper` used to identify the paper nodes.
    paper_attribs : list
        Keys from :class:`.Paper` whose values are attached to each paper node
        as node attributes. The list is only read, never modified.
    **kwargs
        Accepted for call compatibility; not used.

    Returns
    -------
    author_papers_graph : networkx.DiGraph
        A DiGraph 'author_papers_graph'.

    Raises
    ------
    KeyError : Raised when node_id is not a key of :class:`.Paper`, or when it
        is 'citations' (which cannot identify a paper).

    """
    author_papers_graph = nx.DiGraph(type='author_papers')

    # Validate node_id. The keys are copied into a list so that 'citations'
    # can be removed regardless of what kind of object keys() returns.
    meta_keys = list(ds.Paper().keys())
    meta_keys.remove('citations')
    if node_id not in meta_keys:
        raise KeyError("node_id '{0}' cannot be used to identify"
                       " papers.".format(node_id))

    for entry in papers:
        paper_node = entry[node_id]

        # Paper node attributes: the requested fields plus the node type.
        paper_attrib_dict = util.subdict(entry, paper_attribs)
        paper_attrib_dict['type'] = 'paper'
        author_papers_graph.add_node(paper_node, paper_attrib_dict)

        # One label per author, built from the last-name and initials lists.
        # NOTE(review): presumably concat_list pairs them element-wise with
        # a ' ' delimiter -- confirm against util.concat_list.
        authors = util.concat_list(entry['aulast'], entry['auinit'], ' ')
        for author in authors:
            author_papers_graph.add_node(author, type="person")
            # Edge points from the author to the paper.
            author_papers_graph.add_edge(author, paper_node,
                                         date=entry['date'])

    return author_papers_graph
Beispiel #2
0
def _handle_paper(article):
    """
    Build and return a :class:`.Paper` from an 'article' ElementTree node.

    Fields listed in the DfR-to-Paper translation map are copied over as
    upper-cased strings; authors, publication date and page range are
    handled by their dedicated helpers. An 'ayjid' is generated from the
    first author, the date and the journal title when at least one author is
    present.

    Parameters
    ----------
    article : Element
        ElementTree Element 'article'.

    Returns
    -------
    paper : :class:`.Paper`
    """
    paper = dt.Paper()
    record = dict_from_node(article)

    # Straight field-to-field copies. A KeyError here means the field cannot
    # be copied (e.g. the article lacks that key), so it is simply skipped.
    for source_key, paper_key in _dfr2paper_map().items():
        try:
            paper[paper_key] = str(record[source_key]).upper()
        except KeyError:
            continue

    # Author names come back as a (last names, initials) pair.
    last_names, initials = _handle_authors(record['author'])
    paper['aulast'] = last_names
    paper['auinit'] = initials

    # Publication date.
    paper['date'] = _handle_pubdate(record['pubdate'])

    # Page range comes back as a (start page, end page) pair.
    start_page, end_page = _handle_pagerange(record['pagerange'])
    paper['spage'] = start_page
    paper['epage'] = end_page

    # The ayjid needs a first author; with an empty author list the lookup
    # raises IndexError and the ayjid is left unset.
    try:
        first_last = paper['aulast'][0]
        first_init = paper['auinit'][0]
        paper['ayjid'] = _create_ayjid(first_last, first_init,
                                       paper['date'], paper['jtitle'])
    except IndexError:
        pass

    return paper
Beispiel #3
0
def merge(P1, P2, fields=['ayjid']):
    """
    Combines two lists (P1 and P2) of :class:`.Paper` instances into a single
    list, and attempts to merge papers with matching fields. Where there are
    conflicts, values from :class:`.Paper` in P1 will be preferred.

    Two papers match when they hold equal values for every field in
    ``fields`` and at least one of those values is neither None nor the
    empty string. Papers that merely share *missing* identifiers are not
    merged, since a missing identifier says nothing about identity.

    Every matching (P1 paper, P2 paper) pair yields one merged paper. Papers
    that took part in no match are appended unchanged, P1 papers first.

    Parameters
    ----------
    P1 : list
        A list of :class:`.Paper` instances.
    P2 : list
        A list of :class:`.Paper` instances.
    fields : list
        Fields used to identify matching :class:`.Paper`. The list is only
        read, never modified.

    Returns
    -------
    combined : list
        A list of :class:`.Paper` instances.

    Examples
    --------

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> P1 = rd.wos.read("/Path/to/data1.txt")
       >>> P2 = rd.dfr.read("/Path/to/DfR")
       >>> papers = rd.merge(P1, P2, ['ayjid'])
    """

    def _has_value(value):
        # A field counts as populated unless it is None or ''.
        return value is not None and value != ''

    combined = []
    merged_1 = set()    # Indices of P1 papers consumed by a merge.
    merged_2 = set()    # Indices of P2 papers consumed by a merge.

    for x, p_1 in enumerate(P1):
        for y, p_2 in enumerate(P2):
            if any(p_1[field] != p_2[field] for field in fields):
                continue
            # Equal-but-blank identifiers (e.g. None == None) are not
            # evidence that the two records describe the same paper.
            if fields and not any(_has_value(p_1[field]) for field in fields):
                continue

            # Copy populated values from the P2 paper first, then from the
            # P1 paper, so that P1 wins wherever both have a value.
            new_p = dt.Paper()
            for source in (p_2, p_1):
                for key, value in source.items():
                    if _has_value(value):
                        new_p[key] = value

            merged_1.add(x)
            merged_2.add(y)
            combined.append(new_p)

    # Carry over everything that was not merged, preserving input order.
    combined.extend(p for x, p in enumerate(P1) if x not in merged_1)
    combined.extend(p for y, p in enumerate(P2) if y not in merged_2)

    return combined
Beispiel #4
0
def direct_citation(papers, node_id='ayjid', node_attribs=['date'], **kwargs):
    """
    Create a traditional directed citation network.

    Direct-citation graphs are `directed acyclic graphs`__ in which vertices are
    papers, and each (directed) edge represents a citation of the target
    paper by the source paper. The :func:`.networks.papers.direct_citation`
    method generates both a global citation graph, which includes all cited and
    citing papers, and an internal citation graph that describes only citations
    among papers in the original dataset.

    .. _dag: http://en.wikipedia.org/wiki/Directed_acyclic_graph

    __ dag_

    To generate direct-citation graphs, use the
    :func:`.networks.papers.direct_citation` method. Note the size difference
    between the global and internal citation graphs.

    .. code-block:: python

       >>> gDC, iDC = nt.papers.direct_citation(papers)
       >>> len(gDC)
       5998
       >>> len(iDC)
       163

    ==============     =========================================================
    Element            Description
    ==============     =========================================================
    Node               Papers, represented by node_id.
    Edge               From a paper to a cited reference.
    Edge Attribute     Publication date of the citing paper.
    ==============     =========================================================

    Parameters
    ----------
    papers : list
        A list of :class:`.Paper` instances.

    node_id : string
        A key from :class:`.Paper` to identify the nodes. Default is 'ayjid'.
        Papers and citations whose value for this key is None get no node.

    node_attribs : list
        Keys from :class:`.Paper` whose values are attached to the nodes as
        attributes. The list is only read, never modified.

    **kwargs
        Accepted for call compatibility; not used.

    Returns
    -------
    citation_network : networkx.DiGraph
        Global citation network (all citations).
    citation_network_internal : networkx.DiGraph
        Internal citation network where only the papers in the list are nodes in
        the network.

    Raises
    ------
    KeyError : If node_id is not a key of :class:`.Paper`.
    networkx.NetworkXError : If either resulting graph is not a directed
        acyclic graph.
    """
    citation_network = nx.DiGraph(type='citations')
    citation_network_internal = nx.DiGraph(type='citations')

    # Check node_id validity.
    if node_id not in ds.Paper().keys():
        raise KeyError("node_id '{0}' is not in the set of"
                       " meta_keys".format(node_id))

    for entry in papers:
        # Check the head (the citing paper).
        head_id = entry[node_id]
        head_has_id = head_id is not None

        if head_has_id:
            # The citing paper is a node of both networks. Each graph gets
            # its own attribute dict so that a later attribute update on one
            # graph's node can never show up on the other graph's node.
            node_attrib_dict = util.subdict(entry, node_attribs)
            citation_network.add_node(head_id, node_attrib_dict)
            citation_network_internal.add_node(head_id,
                                               dict(node_attrib_dict))

        citations = entry['citations']
        if citations is None:
            continue

        for citation in citations:
            # Check the tail (the cited reference).
            tail_id = citation[node_id]
            if tail_id is None:
                continue

            # A cited reference is always a node of the global network,
            # even when the citing paper itself has no usable id.
            citation_network.add_node(tail_id,
                                      util.subdict(citation, node_attribs))

            if not head_has_id:
                continue

            citation_network.add_edge(head_id, tail_id, date=entry['date'])

            # The edge also belongs to the internal network when the cited
            # reference is itself one of the supplied papers.
            if util.contains(papers,
                             lambda wos_obj: wos_obj[node_id] == tail_id):
                citation_network_internal.add_edge(head_id, tail_id,
                                                   date=entry['date'])

    # Checking if both the graphs are Directed Acyclic Graphs.
    if not nx.is_directed_acyclic_graph(citation_network):
        raise nx.NetworkXError("Citation graph is not a DAG.")
    if not nx.is_directed_acyclic_graph(citation_network_internal):
        raise nx.NetworkXError("Internal citation graph is not a DAG.")

    return citation_network, citation_network_internal
Beispiel #5
0
def bibliographic_coupling(papers, citation_id='ayjid', threshold=1,
                           node_id='ayjid', node_attribs=['date'],
                           weighted=False, **kwargs):
    r"""
    Generate a bibliographic coupling network.

    Two papers are **bibliographically coupled** when they both cite the same,
    third, paper. You can generate a bibliographic coupling network using the
    :func:`.networks.papers.bibliographic_coupling` method.

    .. code-block:: python

       >>> BC = nt.papers.bibliographic_coupling(papers)
       >>> BC
       <networkx.classes.graph.Graph object at 0x102eec710>

    Especially when working with large datasets, or disciplinarily narrow
    literatures, it is usually helpful to set a minimum number of shared
    citations required for two papers to be coupled. You can do this by setting
    the **`threshold`** parameter.

    .. code-block:: python

       >>> BC = nt.papers.bibliographic_coupling(papers, threshold=1)
       >>> len(BC.edges())
       1216
       >>> BC = nt.papers.bibliographic_coupling(papers, threshold=2)
       >>> len(BC.edges())
       542

    ===============    =========================================================
    Element            Description
    ===============    =========================================================
    Node               Papers represented by node_id.
    Node Attributes    node_attribs in :class:`.Paper`
    Edge               (a,b) in E(G) if the similarity of a and b is >=
                       threshold.
    Edge Attributes    similarity: the number of citations shared, or the
                       normalized value described under ``weighted``.
    ===============    =========================================================


    Parameters
    ----------
    papers : list
        A list of :class:`.Paper` instances.
    citation_id: string
        A key from :class:`.Paper` to identify the citation overlaps.  Default
        is 'ayjid'.
    threshold : int
        Minimum similarity (number of shared citations, or the normalized
        value when ``weighted`` is True) to consider two papers "coupled".
    node_id : string
        Field in :class:`.Paper` used to identify the nodes. Default is 'ayjid'.
    node_attribs : list
        List of fields in :class:`.Paper` to include as node attributes in
        graph. The list is only read, never modified.
    weighted : bool
        If True, edge attribute `similarity` is a float in {0-1} calculated as
        :math:`\cfrac{N_{ij}}{\sqrt{N_{i}N_{j}}}` where :math:`N_{i}` and
        :math:`N_{j}` are the number of references in :class:`.Paper` *i* and
        *j*, respectively, and :math:`N_{ij}` is the number of references
        shared by papers *i* and *j*.
    **kwargs
        Accepted for call compatibility; not used.

    Returns
    -------
    bcoupling : networkx.Graph
        A bibliographic coupling network. Only papers that are coupled to at
        least one other paper appear as nodes.

    Raises
    ------
    KeyError : Raised when node_id is not a key of :class:`.Paper`, or when
        citation_id is not a key of :class:`.Paper` or is 'citations'.

    Notes
    -----
    Lists cannot be attributes? causing errors for both gexf and graphml also
    nodes cannot be none.
    """

    bcoupling = nx.Graph(type='biblio_coupling')

    # Validate identifiers. The keys are copied into a list so that
    # 'citations' can be removed regardless of what keys() returns.
    meta_keys = list(ds.Paper().keys())
    if node_id not in meta_keys:
        raise KeyError("node_id '{0}' is not a meta_dict key.".format(node_id))

    # 'citations' is the only invalid meta_key for citation_id
    meta_keys.remove('citations')
    if citation_id not in meta_keys:
        raise KeyError("citation_id '{0}' is not a meta_dict key or otherwise"
                       " cannot be used to detect citation"
                       " overlap.".format(citation_id))

    # Build each paper's list of cited ids and its node attributes once,
    # rather than once per pair of papers.
    cited_ids = []
    node_attrs = []
    for paper in papers:
        citations = paper['citations']
        if citations is None:
            cited_ids.append([])
        else:
            cited_ids.append([c[citation_id] for c in citations])
        node_attrs.append(util.subdict(paper, node_attribs))

    n_papers = len(papers)
    for i in range(n_papers):
        i_list = cited_ids[i]
        for j in range(i + 1, n_papers):
            j_list = cited_ids[j]

            overlap = util.overlap(i_list, j_list)

            if weighted:
                if len(overlap) > 0:
                    # A non-empty overlap implies both lists are non-empty,
                    # so the normalizing factor cannot be zero.
                    w = (float(len(i_list)) * float(len(j_list))) ** 0.5
                    similarity = float(len(overlap)) / w
                else:
                    similarity = 0
            else:
                similarity = len(overlap)

            # Add nodes and edge if the citation overlap is sufficiently high.
            if similarity >= threshold:
                bcoupling.add_node(papers[i][node_id], node_attrs[i])
                bcoupling.add_node(papers[j][node_id], node_attrs[j])
                bcoupling.add_edge(papers[i][node_id],
                                   papers[j][node_id],
                                   similarity=similarity)
    return bcoupling
Beispiel #6
0
def _pubmed_text(node, path):
    """Return the text of the first element under node matching path, or None."""
    found = node.find(path)
    if found is None:
        return None
    return found.text


def _pubmed_author_lists(nodes, surname_path, given_path):
    """
    Return parallel (surnames, first initials) lists, one entry per node.

    A missing surname yields None; a missing or empty given name yields None
    as the initial.
    """
    aulast = []
    auinit = []
    for node in nodes:
        aulast.append(_pubmed_text(node, surname_path))
        # multiple given names? this takes the first character only
        given = _pubmed_text(node, given_path)
        auinit.append(given[0] if given else None)
    return aulast, auinit


def read(filepath):
    """
    Given a file with PubMed XML, return a list of :class:`.Paper` instances.

    See the following hyperlinks regarding possible structures of XML:
    * http://www.ncbi.nlm.nih.gov/pmc/pmcdoc/tagging-guidelines/citations/v2/citationtags.html#2Articlewithmorethan10authors%28listthefirst10andaddetal%29
    * http://dtd.nlm.nih.gov/publishing/

    Each :class:`.Paper` is tagged with an accession id for this
    read/conversion. One :class:`.Paper` is produced per 'article' element;
    its 'citations' value is a list of :class:`.Paper` instances built from
    the article's 'element-citation' references.

    **Usage**

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> papers = rd.pubmed.read("/Path/to/PubMedData.xml")

    Parameters
    ----------
    filepath : string
        Path to PubMed XML file.

    Returns
    -------
    meta_list : list
        A list of :class:`.Paper` instances.

    Raises
    ------
    IOError : If the file does not exist or cannot be read.
    """

    try:
        tree = ET.parse(filepath)
    except IOError:  # File does not exist, or couldn't be read.
        raise IOError("File does not exist, or cannot be read.")
    root = tree.getroot()

    accession = str(uuid.uuid4())

    # define location of simple article meta data relative to xml tree rooted
    # at 'article'
    meta_loc = {
        'atitle': './front/article-meta/title-group/article-title',
        'jtitle':
        ('./front/journal-meta/journal-title-group/' + 'journal-title'),
        'volume': './front/article-meta/volume',
        'issue': './front/article-meta/issue',
        'spage': './front/article-meta/fpage',
        'epage': './front/article-meta/lpage'
    }

    # location relative to element-citation element
    cit_meta_loc = {
        'atitle': './article-title',
        'jtitle': './source',
        'date': './year',
        'volume': './volume',
        'spage': './fpage',
        # The last page is tagged 'lpage' (as in meta_loc above).
        'epage': './lpage'
    }

    meta_list = []
    for article in root.iter('article'):
        paper = ds.Paper()

        # collect the simple data from the 'front' section of the article;
        # fields that are absent are set to None
        for key, path in meta_loc.items():
            paper[key] = _pubmed_text(article, path)

        # collect doi and pmid; if never found they keep whatever value
        # Paper() initialized them with
        for identifier in article.findall('./front/article-meta/article-id'):
            id_type = identifier.get('pub-id-type')
            if id_type in ('doi', 'pmid'):
                paper[id_type] = identifier.text

        # collect aulast and auinit from contributors marked as authors
        contribs = article.findall(
            './front/article-meta/contrib-group/contrib')
        authors = [c for c in contribs if c.get('contrib-type') == 'author']
        paper['aulast'], paper['auinit'] = _pubmed_author_lists(
            authors, './name/surname', './name/given-names')

        # collect date: only 'collection' publication dates are used
        for pub_date in article.findall('./front/article-meta/pub-date'):
            if pub_date.get('pub-type') == 'collection':
                paper['date'] = _pubmed_text(pub_date, './year')

        # construct ayjid
        # NOTE(review): the original author marked this call as broken; the
        # signature of create_ayjid is not visible here -- confirm and fix.
        paper['ayjid'] = create_ayjid(**paper)  # THIS IS BROKEN.

        # citations
        citations_list = []

        # element-citation handling different from mixed-citation handling
        for cite in article.findall('./back/ref-list/ref/element-citation'):
            cite_dict = ds.Paper()

            # simple meta data, stored on the citation itself
            for key, path in cit_meta_loc.items():
                cite_dict[key] = _pubmed_text(cite, path)

            # doi and pmid
            pub_id = cite.find('./pub-id')
            if pub_id is not None:
                pub_id_type = pub_id.get('pub-id-type')
                if pub_id_type in ('doi', 'pmid'):
                    cite_dict[pub_id_type] = pub_id.text

            # aulast and auinit: only a person-group of type 'author' counts
            person_group = cite.find('./person-group')
            if (person_group is not None and
                    person_group.get('person-group-type') == 'author'):
                cite_aulast, cite_auinit = _pubmed_author_lists(
                    person_group.findall('./name'),
                    './surname', './given-names')
            else:
                cite_aulast, cite_auinit = [], []

            # empty author lists are stored as None
            cite_dict['aulast'] = cite_aulast or None
            cite_dict['auinit'] = cite_auinit or None

            citations_list.append(cite_dict)
        # end cite loop

        paper['citations'] = citations_list

        paper['accession'] = accession

        # each article contributes exactly one Paper to the result
        meta_list.append(paper)
    # end article loop

    return meta_list
Beispiel #7
0
def convert(wos_data):
    """
    Convert parsed field-tagged data to :class:`.Paper` instances.

    Convert a dictionary or list of dictionaries with keys from the
    Web of Science field tags into a list of :class:`.Paper` instances, the
    standard for Tethne. A single dictionary is treated as a one-item list.

    Each :class:`.Paper` is tagged with an accession id for this conversion.

    Parameters
    ----------
    wos_data : list
        A list of dictionaries with keys from the WoS field tags. When a
        record has group authors ('CA'), its 'AU' and 'AF' entries are
        updated in place to include them.

    Returns
    -------
    papers : list
        A list of :class:`.Paper` instances.

    Examples
    --------

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> wos_list = rd.wos.parse("/Path/to/data.txt")
       >>> papers = rd.wos.convert(wos_list)

    Notes
    -----
    Need to handle author name anomolies (case, blank spaces, etc.) that may
    make the same author appear to be two different authors in Networkx; this is
    important for any graph with authors as nodes.

    """

    accession = str(uuid.uuid4())

    # create a Paper for each wos_dict and append to this list
    papers = []

    # handle dict inputs by converting to a 1-item list
    if isinstance(wos_data, dict):
        wos_data = [wos_data]

    # Validation runs before any Paper is built [62809724]. Its result is
    # not acted upon: a failed validation does not stop the conversion.
    _validate(wos_data)

    # Define the direct relationships between WoS fieldtags and Paper keys.
    translator = _wos2paper_map()

    # Perform the key convertions
    for wos_dict in wos_data:
        paper = ds.Paper()

        # direct translations
        for tag, paper_key in translator.items():
            paper[paper_key] = wos_dict[tag]

        # Group authors ('CA') are treated as personal authors. A 'CA' entry
        # holding None carries no authors and must not touch 'AU'/'AF'.
        group_authors = wos_dict['CA'] if 'CA' in wos_dict else None
        if group_authors is not None:
            for tag in ('AU', 'AF'):
                # The tag may be absent (KeyError) or hold None (TypeError);
                # either way the group authors become the whole value.
                try:
                    wos_dict[tag] += group_authors
                except (KeyError, TypeError):
                    wos_dict[tag] = group_authors

        # more complicated translations
        # FIXME: not robust to all names, organziation authors, etc.
        if wos_dict['AU'] is not None:
            paper['aulast'], paper['auinit'] = _handle_authors(wos_dict)

        # construct ayjid
        paper['ayjid'] = _create_ayjid(paper['aulast'], paper['auinit'],
                                       paper['date'], paper['jtitle'])

        # Parse author-institution affiliations. #60216226, #57746858.
        if wos_dict['C1'] is not None:
            paper['institutions'] = _handle_author_institutions(wos_dict)

        # Convert CR references into paper format
        if wos_dict['CR'] is not None:
            paper['citations'] = [_parse_cr(ref) for ref in wos_dict['CR']]

        paper['accession'] = accession

        papers.append(paper)
    # End wos_dict for loop.

    return papers
Beispiel #8
0
def _parse_institutions(ref):
    """
    Supports the Web of Science reader by converting the strings found at the C1
    fieldtag of a record into a minimum :class:`.Paper` instance.

    The string is split on commas. The first token is split on spaces to
    give 'aulast' and 'auinit'; the next three tokens, with their leading
    characters stripped, give 'addr2', 'addr3' and 'country'. Parsing stops
    silently at the first missing token, leaving the remaining fields as
    :class:`.Paper` initialized them.

    Parameters
    ----------
    ref : str
        'C1' field tag data from a plain text Web of Science file which contains
        Author First and Last names, Institution affiliated, and the
        location/city where they are affiliated to.

    Returns
    -------
    addr_dict : :class:`.Paper`
        A :class:`.Paper` instance, with 'auinsid' set from the author, date
        and journal title fields.

    Notes
    -----
    IndexError and ValueError raised while reading the tokens are swallowed,
    not propagated.

    Needs to check many test cases to check various input types.

    """
    addr_dict = ds.Paper()
    tokens = ref.split(',')

    # (field, token position, number of leading characters to drop)
    address_layout = (
        ('addr2', 1, 1),
        ('addr3', 2, 1),
        ('country', 3, 2),
    )

    try:
        name_tokens = tokens[0].split(' ')
        addr_dict['aulast'] = name_tokens[0]
        addr_dict['auinit'] = name_tokens[1]

        for field, position, skip in address_layout:
            addr_dict[field] = tokens[position][skip:]
    except (IndexError, ValueError):
        # ref did not have the full set of tokens (or a value could not be
        # interpreted); whatever was parsed so far is kept.
        pass

    addr_dict['auinsid'] = _create_ayjid(addr_dict['aulast'],
                                         addr_dict['auinit'],
                                         addr_dict['date'],
                                         addr_dict['jtitle'])

    return addr_dict
Beispiel #9
0
def _parse_cr(ref):
    """
    Supports the Web of Science reader by converting the strings found
    at the CR field tag of a record into a minimum :class:`.Paper` instance.

    Parameters
    ----------
    ref : str
        CR field tag data from a plain text Web of Science file.

    Returns
    -------
    paper : :class:`.Paper`
        A :class:`.Paper` instance. Fields that could not be parsed keep the
        value :class:`.Paper` initialized them with.

    Notes
    -----
    IndexError (too few tokens) and ValueError (e.g. a date token without
    digits) stop the parsing silently; they are not propagated. Whatever was
    parsed up to that point is kept.

    Needs a sophisticated name parser, would like to use an open source resource
    for this.

    If WoS is missing a field in the middle of the list there are NOT commas
    indicating that; the following example does NOT occur:

        Doe J, ,, Some Journal

    instead

        Doe J, Some Journal

    This threatens the integrity of WoS data; should we address it?

    Another threat: if WoS is unsure of the DOI number there will be multiple
    DOI numbers in a list of form [doi1, doi2, ...], address this?

    """
    paper = ds.Paper()
    # tokens of form: aulast auinit, date, jtitle, volume, spage, doi
    tokens = ref.split(',')
    try:
        # FIXME: needs better name parser
        name = tokens[0]

        # Temp Solution for #62809724: when the name contains a bracketed
        # part, use only the text inside the first pair of brackets.
        match = re.search(r'\[(.*?)\]', name)
        if match:
            name = match.group(1)
            if DEBUG:
                print('stripped name:  %s' % (name,))

        name_tokens = name.split(' ')
        if len(name_tokens) < 2:
            # No initials present: a single blank stands in for them.
            name_tokens.append(' ')

        # Last name is the first word; the initials are the remaining words
        # run together with any periods removed.
        paper['aulast'] = [name_tokens[0]]
        paper['auinit'] = [''.join(name_tokens[1:]).replace('.', '')]

        if DEBUG:
            print('Final Meta Dicts %s %s' % (paper['aulast'],
                                              paper['auinit']))

        # strip initial characters based on the field (spaces, 'V', 'DOI')
        paper['date'] = int(tokens[1][1:])
        paper['jtitle'] = tokens[2][1:]
        paper['volume'] = tokens[3][2:]
        paper['spage'] = tokens[4][2:]
        paper['doi'] = tokens[5][5:]
    except IndexError:
        # ref did not have the full set of tokens
        pass
    except ValueError:
        # This occurs when the program expects a date but gets a string with
        # no numbers. We leave the field incomplete because chances are the
        # CR string is too sparse to use anyway.
        pass

    paper['ayjid'] = _create_ayjid(paper['aulast'], paper['auinit'],
                                   paper['date'], paper['jtitle'])

    return paper