def tags(record):
    """Normalise the tags and parsed author/editor names of a bibtex record.

    The record is converted to unicode and run through ``c.author`` and
    ``c.editor``. Its ``'tags'`` field (if any) is split on ``,`` / ``;``
    into a set of stripped strings, and ``'p_authors'`` is filled with the
    ``c.splitname`` result for each name.

    :param record: bibtex entry dict; must contain ``'ID'`` (used for logging).
    :return: the processed record, with ``'tags'`` replaced by a ``set`` and
        ``'p_authors'`` set to a list (empty when nothing could be parsed).
    """
    record = b.customization.convert_to_unicode(record)
    record = c.author(record)
    record = c.editor(record)
    tag_set = set()

    if 'tags' in record:
        tag_set.update(
            i.strip()
            for i in re.split(',|;', record["tags"].replace('\n', ''))
        )

    record['tags'] = tag_set
    record['p_authors'] = []
    logging.debug(f"Handling: {record['ID']}")
    if 'author' in record:
        try:
            record['p_authors'] = [
                c.splitname(x, False) for x in record['author']
            ]
        except Exception:
            # Best effort: a malformed name must not stop processing (and
            # must not drop into a debugger); 'p_authors' stays empty.
            logging.exception(
                f"Could not split author names for: {record['ID']}")
    if 'editor' in record:
        # NOTE(review): this replaces any author names parsed above rather
        # than appending to them -- confirm that this is intended.
        record['p_authors'] = [c.splitname(x, False) for x in record['editor']]

    return record
 def test_splitname_cases(self):
     """Compare splitname() with the expected parts in splitname_test_cases.

     The original note describes the expected values as output from BibTeX.
     """
     for name, expected in splitname_test_cases:
         actual = splitname(name)
         failure_hint = "Input name: {0}".format(name)
         self.assertEqual(actual, expected, msg=failure_hint)
Exemple #3
0
def format_author_list(authors):
    """Return the formatted version of every name in *authors*.

    Each name has its braces cleaned by ``cleaner.clean_braces``, is split
    with ``splitname`` in non-strict mode, and is then rendered by
    ``format_author_name``. The result list keeps the input order.
    """
    def _tidy(raw_author):
        unbraced = cleaner.clean_braces(raw_author)
        name_parts = splitname(unbraced, strict_mode=False)
        return format_author_name(name_parts)

    return [_tidy(raw_author) for raw_author in authors]
Exemple #4
0
def clean_name(author):
    """Return a cleaned, formatted name for a single author entry.

    :param author: either a ``dict`` or a ``str``.

        * ``dict`` with ``"name"``: the name is stripped, cleaned with
          ``clean_text`` and wrapped by ``enclose_braces``; it is not split.
        * ``dict`` with ``"given"`` and/or ``"family"``: the available parts
          are joined, split with ``splitname`` and formatted. When only one
          of the two is present a "Check me" line is printed.
        * ``str``: returned as-is from ``clean_institute_author`` when that
          helper flags it as an institute; otherwise braces are cleaned and
          the name is split and formatted.
    :return: the formatted author string.
    :raises KeyError: if a dict has none of ``"name"``, ``"given"``,
        ``"family"``.
    :raises Exception: if *author* is neither a ``dict`` nor a ``str``.
    """
    if isinstance(author, dict):
        if "name" in author:
            return enclose_braces(clean_text(author["name"].strip()))

        has_given = "given" in author
        has_family = "family" in author
        if not (has_given or has_family):
            # Previously this fell through to author["name"], which can never
            # exist at this point; fail with a message that says what is
            # actually missing (same exception type as before).
            raise KeyError(
                "author entry has none of 'name', 'given', 'family': "
                "{!r}".format(author))

        if has_given and has_family:
            full_name = author["given"] + " " + author["family"]
        else:
            # Only half of the name is known: flag it for manual review.
            full_name = author["given"] if has_given else author["family"]
            print("Check me: " + full_name)
        name_parts = splitname(full_name)
    elif isinstance(author, str):
        institute_author, is_institute = clean_institute_author(author)
        if is_institute:
            return institute_author

        name_parts = splitname(clean_braces(author))
    else:
        raise Exception("Unknown author type!")

    return formatter.format_author_name(name_parts)
Exemple #5
0
def split_authors_name(
        authors: List[str], separator: str = "and") -> List[Dict[str, str]]:
    """
    Convert a list of authors to papis formatted data.

    :arg authors: A list of single author names or multiple authors separated
        by *separator*.
    :arg separator: Word that delimits authors inside one string. It is
        interpolated into a regular expression and only matches when
        surrounded by whitespace.
    :returns: One ``{"family": ..., "given": ...}`` dict per author, in input
        order. Blank names (for which ``splitname`` yields no parts) are
        skipped.
    """
    from bibtexparser.customization import splitname

    # Compile once; the pattern does not change between input strings.
    author_pattern = re.compile(r"\s+{}\s+".format(separator))

    author_list = []
    for subauthors in authors:
        for author in author_pattern.split(subauthors):
            parts = splitname(author)
            if not parts:
                # splitname returns an empty dict for empty or
                # whitespace-only names; indexing it would raise KeyError.
                continue
            given = " ".join(parts["first"])
            family = " ".join(parts["von"] + parts["last"] + parts["jr"])

            author_list.append(dict(family=family, given=given))

    return author_list
def handle_authors(entry: dict) -> dict:
    """
    Sets 'author' and 'editor' each to:
        1) list of dicts
        (each author passed through bibtexparser.customization.splitname)
        2) None, if 'author'/'editor' not in item_

    In each dict 'first' keeps the list of given-name tokens; 'last', 'von'
    and 'jr' are strings, with multiple tokens joined by a single space
    (e.g. von == 'de la') and '' when the part is absent. Blank names
    between separators are skipped.

    Example:
        item_['author'] = [
                {
                    'first': ['J.', 'L.'],
                    'last': 'Bredas',
                    'von': '',
                    'jr': '',
                },
                {
                    'first': ['Georg', 'Henrik'],
                    'last': 'Wright',
                    'von': 'von',
                    'jr': '',
                },
            ]
    :param entry: item_-dict (modified in place)
    :return: item_-dict with formatted author
    """
    for field in ('author', 'editor'):
        if field not in entry:
            entry[field] = None
            continue

        authors_ = []
        for au in entry[field].split(' and '):
            au_dict = bib_custom.splitname(au)
            if not au_dict:
                # splitname yields {} for blank names; nothing to record.
                continue
            # Join rather than take element 0 so that multi-token parts
            # such as "de la" are not truncated to their first word.
            au_dict_new = {k: ' '.join(v) for k, v in au_dict.items()}
            au_dict_new['first'] = au_dict['first']
            authors_.append(au_dict_new)
        entry[field] = authors_
    return entry
def custom(record):
    """Apply the bibtexparser customisations and collect tags and authors.

    After the ``c.*`` customisations, the ``tags``, ``keywords`` and
    ``mendeley-tags`` fields (whichever exist) are split on ``,`` / ``;``
    and merged into one set stored under ``'tags'``. ``'p_authors'`` holds
    the non-strict ``c.splitname`` result for every author, or an empty
    list when the record has no ``'author'`` field.
    """
    customisations = (
        c.type, c.author, c.editor, c.journal, c.keyword, c.link, c.doi,
    )
    for customise in customisations:
        record = customise(record)

    collected = set()
    for field in ('tags', "keywords", "mendeley-tags"):
        if field not in record:
            continue
        single_line = record[field].replace('\n', '')
        collected.update([piece.strip()
                          for piece in re.split(',|;', single_line)])

    record['tags'] = collected
    record['p_authors'] = []
    if 'author' in record:
        parsed_authors = [c.splitname(x, False) for x in record['author']]
        record['p_authors'] = parsed_authors
    return record
def clean_full(record):
    """Run all ``c.*`` customisations and derive ``tags`` / ``p_authors``.

    ``'tags'`` becomes the set of stripped pieces found in the ``tags``,
    ``keywords`` and ``mendeley-tags`` fields (split on ``,`` or ``;``).
    ``'p_authors'`` collects one item per author followed by one per editor.
    """
    record = c.type(record)
    record = c.author(record)
    record = c.editor(record)
    record = c.journal(record)
    record = c.keyword(record)
    record = c.link(record)
    record = c.doi(record)

    def _pieces(raw_value):
        # Drop newlines first so a wrapped field splits like a single line.
        flattened = raw_value.replace('\n', '')
        return [piece.strip() for piece in re.split(',|;', flattened)]

    tags = set()
    for key in ('tags', "keywords", "mendeley-tags"):
        if key in record:
            tags.update(_pieces(record[key]))

    record['tags'] = tags
    record['p_authors'] = []

    # NOTE(review): authors are stored as plain ``split(' and ')`` lists,
    # whereas editors below are parsed with c.splitname -- confirm the
    # difference is intended.
    if 'author' in record:
        author_items = [x.split(' and ') for x in record['author']]
        record['p_authors'] += author_items

    if 'editor' in record:
        editor_items = [c.splitname(x, False) for x in record['editor']]
        record['p_authors'] += editor_items

    return record
def custom(record):
    """Clean one bibtex record: unicode, attached files and tags.

    Every step is best effort: a failure is logged and flagged in
    ``record['error']`` instead of aborting the whole record.

    1. Convert the record to unicode.
    2. Parse ``record['file']`` into a set of paths; when the record has no
       ``'hashes'`` yet, store the file hashes and the regularised file list.
    3. For each file that lies outside ``args.library``, ask on stdin whether
       to copy it to ``<library>/<year>/<author surnames>/`` and, on
       confirmation, copy it and update ``record['file']``.
    4. When the record has no ``'tags'``, merge ``'keywords'`` and
       ``'mendeley-tags'`` into a comma-joined ``'tags'`` field.

    :param record: bibtex entry dict; expected to contain ``'ID'``.
    :return: the processed record.
    """
    from os import makedirs

    try:
        record = c.convert_to_unicode(record)
    except TypeError:
        logging.warning("Unicode Error on: {}".format(record['ID']))
        record['error'] = 'unicode'

    # Bound up front so the copy step below is simply skipped (instead of
    # raising NameError) when the file field is missing or unparsable.
    file_set = set()
    try:
        # add hashes of associated files
        files = [
            add_slash_if_necessary(y)
            for x in record['file'].split(';')
            for y in x.split(':')
            if y.strip() and y.strip().lower() != 'pdf'
        ]
        file_set = set(files)
        if 'hashes' not in record:
            hashes = [file_to_hash(x) for x in file_set]
            record['hashes'] = ";".join(hashes)
            # regularize format of files list
            record['file'] = ";".join(file_set)
    except Exception as e:
        detail = e.args[0] if e.args else e
        logging.warning("File Error: {} : {}".format(record['ID'], detail))
        record['error'] = 'file'

    # todo: if file is not in the library common prefix, move it there
    # look for year, then first surname, then copy in, making dir if necessary
    # Iterate over a snapshot: file_set is modified inside the loop.
    for x in list(file_set):
        try:
            current_path = realpath(x)
            common = commonpath([current_path, args.library])
            if common != args.library:
                logging.info("Found file outside library: {}".format(current_path))
                logging.info("Common: {}".format(common))
                # get the author and year
                year = record['year']
                authors = c.getnames([
                    i.strip()
                    for i in record["author"].replace('\n', ' ').split(" and ")
                ])
                authors_split = [c.splitname(a) for a in authors]
                author_surnames = [a['last'][0] for a in authors_split]
                new_path = join(args.library, year, ", ".join(author_surnames))
                logging.info("New Path: {}".format(new_path))
                full_new_path = join(new_path, split(current_path)[1])
                logging.info("Copying file")
                logging.info("From: {}".format(current_path))
                logging.info("To: {}".format(full_new_path))
                response = input("Enter to confirm: ")
                if response == "":
                    logging.info("Proceeding")
                    # makedirs also creates a missing <year> directory,
                    # which a plain mkdir cannot do.
                    makedirs(new_path, exist_ok=True)
                    if exists(full_new_path):
                        raise Exception("File already exists")
                    copyfile(x, full_new_path)
                    file_set.remove(x)
                    file_set.add(full_new_path)
                    record['file'] = ";".join(file_set)
        except Exception as e:
            logging.info("Issue copying file for: {}".format(x))
            logging.info(e)
            record['error'] = 'file_copy'

    # regularize keywords
    try:
        if 'tags' not in record:
            keywords = set()
            for field in ('keywords', 'mendeley-tags'):
                if field in record:
                    keywords.update(
                        x.strip() for x in record[field].split(','))
                    del record[field]

            record['tags'] = ",".join(keywords)
    except Exception:
        # The previous handler named `Error`, which is not defined in the
        # visible source; Exception covers anything it could have caught.
        logging.warning("Tag Error: {}".format(record['ID']))
        record['error'] = 'tag'

    # The bibtexparser field customisations (type, author, editor, journal,
    # keyword, link, doi) and 'p_authors' are currently not applied here.
    return record
    def test_splitname_basic(self):
        """Basic tests of customization.splitname() on edge-case input.

        Covers empty and whitespace-only names, names that strict mode must
        reject with InvalidName, and the parts returned for those same names
        when strict mode is off.
        """
        # Empty input and whitespace-only names give an empty result.
        blank_cases = [
            ("", "Invalid output for empty name"),
            ("    ", "Invalid output for space-only name"),
            ("  \t~~", "Invalid output for whitespace name"),
        ]
        for raw_name, failure_msg in blank_cases:
            self.assertEqual(splitname(raw_name), {}, msg=failure_msg)

        # Test strict mode: each of these names must be rejected.
        rejected_names = [
            # Trailing comma (4 cases).
            "BB,",
            "BB,  ",
            "BB, ~\t",
            ", ~\t",
            # Too many sections.
            "AA, BB, CC, DD",
            # Unterminated opening brace (x3).
            "AA {BB CC",
            "AA {{{BB CC",
            "AA {{{BB} CC}",
            # Unmatched closing brace (x3).
            "AA BB CC}",
            "AA BB CC}}}",
            "{AA {BB CC}}}",
        ]
        for raw_name in rejected_names:
            with self.assertRaises(InvalidName):
                splitname(raw_name, strict_mode=True)

        # Test strict mode off: the same kinds of input are repaired.
        trailing_comma_msg = (
            "Invalid output for trailing comma with strict mode off")
        too_many_msg = (
            "Invalid output for too many sections with strict mode off")
        open_brace_msg = (
            "Invalid output for unterminated opening brace with strict mode off")
        close_brace_msg = (
            "Invalid output for unmatched closing brace with strict mode off")
        only_bb = {'first': [], 'von': [], 'last': ["BB"], 'jr': []}

        lenient_cases = [
            # Trailing comma.
            ("BB,", only_bb, trailing_comma_msg),
            ("BB,   ", only_bb, trailing_comma_msg),
            ("BB,  ~\t ", only_bb, trailing_comma_msg),
            (",  ~\t", {}, trailing_comma_msg),
            # Too many sections.
            ("AA, BB, CC, DD",
             {'first': ["CC", "DD"], 'von': [], 'last': ["AA"], 'jr': ["BB"]},
             too_many_msg),
            # Unterminated opening brace.
            ("AA {BB CC",
             {'first': ["AA"], 'von': [], 'last': ["{BB CC}"], 'jr': []},
             open_brace_msg),
            ("AA {{{BB CC",
             {'first': ["AA"], 'von': [], 'last': ["{{{BB CC}}}"], 'jr': []},
             open_brace_msg),
            ("AA {{{BB} CC}",
             {'first': ["AA"], 'von': [], 'last': ["{{{BB} CC}}"], 'jr': []},
             open_brace_msg),
            # Unmatched closing brace.
            ("AA BB CC}",
             {'first': ["AA", "BB"], 'von': [], 'last': ["{CC}"], 'jr': []},
             close_brace_msg),
            ("AA BB CC}}}",
             {'first': ["AA", "BB"], 'von': [], 'last': ["{{{CC}}}"],
              'jr': []},
             close_brace_msg),
            ("{AA {BB CC}}}",
             {'first': [], 'von': [], 'last': ["{{AA {BB CC}}}"], 'jr': []},
             close_brace_msg),
        ]
        for raw_name, expected, failure_msg in lenient_cases:
            result = splitname(raw_name, strict_mode=False)
            self.assertEqual(result, expected, msg=failure_msg)

        # Test it handles commas at higher brace levels.
        braced = splitname("CC, dd, {AA, BB}")
        self.assertEqual(
            braced,
            {'first': ["{AA, BB}"], 'von': [], 'last': ["CC"], 'jr': ["dd"]},
            msg="Invalid output for braced commas")
 def test_splitname_cases(self):
     """Run every (name, expected) pair of splitname_test_cases.

     The original note describes the expected values as output from BibTeX.
     """
     for case in splitname_test_cases:
         raw_name, wanted = case
         self.assertEqual(
             splitname(raw_name),
             wanted,
             msg="Input name: {0}".format(raw_name),
         )
    def test_splitname_basic(self):
        """Basic tests of customization.splitname() """

        def expect(name, wanted, why, **options):
            # Compare splitname(name, **options) with the wanted parts.
            self.assertEqual(splitname(name, **options), wanted, msg=why)

        def expect_invalid(name):
            # Strict mode has to reject the name with InvalidName.
            with self.assertRaises(InvalidName):
                splitname(name, strict_mode=True)

        def parts(first=(), last=(), jr=()):
            # Expected result dict; no case here has a 'von' part.
            return {
                'first': list(first),
                'von': [],
                'last': list(last),
                'jr': list(jr),
            }

        # Empty input.
        expect("", {}, "Invalid output for empty name")

        # Whitespace-only names.
        expect("    ", {}, "Invalid output for space-only name")
        expect("  \t~~", {}, "Invalid output for whitespace name")

        # Test strict mode.
        for name in ("BB,", "BB,  ", "BB, ~\t", ", ~\t"):
            expect_invalid(name)            # Trailing comma (4 cases).
        expect_invalid("AA, BB, CC, DD")    # Too many sections.
        for name in ("AA {BB CC", "AA {{{BB CC", "AA {{{BB} CC}"):
            expect_invalid(name)            # Unterminated opening brace (x3).
        for name in ("AA BB CC}", "AA BB CC}}}", "{AA {BB CC}}}"):
            expect_invalid(name)            # Unmatched closing brace (x3).

        # Test strict mode off for trailing comma.
        why = "Invalid output for trailing comma with strict mode off"
        expect("BB,", parts(last=["BB"]), why, strict_mode=False)
        expect("BB,   ", parts(last=["BB"]), why, strict_mode=False)
        expect("BB,  ~\t ", parts(last=["BB"]), why, strict_mode=False)
        expect(",  ~\t", {}, why, strict_mode=False)

        # Test strict mode off for too many sections.
        why = "Invalid output for too many sections with strict mode off"
        expect("AA, BB, CC, DD",
               parts(first=["CC", "DD"], last=["AA"], jr=["BB"]),
               why, strict_mode=False)

        # Test strict mode off for an unterminated opening brace.
        why = "Invalid output for unterminated opening brace with strict mode off"
        expect("AA {BB CC",
               parts(first=["AA"], last=["{BB CC}"]),
               why, strict_mode=False)
        expect("AA {{{BB CC",
               parts(first=["AA"], last=["{{{BB CC}}}"]),
               why, strict_mode=False)
        expect("AA {{{BB} CC}",
               parts(first=["AA"], last=["{{{BB} CC}}"]),
               why, strict_mode=False)

        # Test strict mode off for an unmatched closing brace.
        why = "Invalid output for unmatched closing brace with strict mode off"
        expect("AA BB CC}",
               parts(first=["AA", "BB"], last=["{CC}"]),
               why, strict_mode=False)
        expect("AA BB CC}}}",
               parts(first=["AA", "BB"], last=["{{{CC}}}"]),
               why, strict_mode=False)
        expect("{AA {BB CC}}}",
               parts(last=["{{AA {BB CC}}}"]),
               why, strict_mode=False)

        # Test it handles commas at higher brace levels.
        expect("CC, dd, {AA, BB}",
               parts(first=["{AA, BB}"], last=["CC"], jr=["dd"]),
               "Invalid output for braced commas")
Exemple #13
0
def get_doi(entry, config):
    """Find or validate the DOI of *entry* through the ``Works`` endpoint.

    * Without a DOI but with a URL containing ``"doi"``, the cleaned URL is
      tried as a DOI and accepted when the looked-up work is similar to the
      entry.
    * If there is still no DOI, works are queried by the first author's last
      name and the title; of at most 10 results the first similar one
      supplies the DOI.
    * Otherwise the existing DOI is checked: it is re-set when it is not a
      Crossref work or when the work is similar, and removed (together with
      a URL containing ``"doi"``) when the work is not similar.

    :param entry: bibliography entry to update.
    :param config: provides ``get_max_levenshtein_distance()`` and
        ``get_update_URL()``.
    :return: tuple ``(entry, has_doi)``.
    """
    has_doi = bib_parser.has_doi(entry)
    my_etiquette = Etiquette(constants.PROJECT_NAME, constants.VERSION,
                             constants.URL, constants.EMAIL)
    max_distance = config.get_max_levenshtein_distance()
    update_URL = config.get_update_URL()

    works = Works(etiquette=my_etiquette)

    # Step 1: try to recover a DOI that is embedded in the URL field.
    if not has_doi and bib_parser.has_url(entry):
        entry_url = bib_parser.get_url(entry)
        if "doi" in entry_url:
            doi = cleaner.clean_doi(entry_url)
            if is_crossref_work(doi) and crossref_is_similar(
                    works.doi(doi), entry, max_distance):
                entry = set_doi(entry, doi, update_URL)
                has_doi = True

    if has_doi:
        # Step 2a: we check to see if the doi is correct.
        doi = cleaner.clean_doi(bib_parser.get_doi(entry))
        if not is_crossref_work(doi):
            entry = set_doi(entry, doi, update_URL)
        elif crossref_is_similar(works.doi(doi), entry, max_distance):
            entry = set_doi(entry, doi, update_URL)
        else:
            entry.pop("doi", None)
            if "doi" in bib_parser.get_url(entry):
                entry.pop("url", None)
            has_doi = False
        return entry, has_doi

    # Step 2b: we try to find the doi for the title.
    entry_title = cleaner.clean_braces(bib_parser.get_title(entry))
    author = bib_parser.get_author(entry)
    first_author = splitname(author[0], strict_mode=False)

    works_query = works.query(author=first_author["last"][0],
                              bibliographic=entry_title)
    works_query = works_query.sort("score").order("desc").select(
        ["title", "DOI"])
    max_items = min(works_query.count(), 10)
    candidates = iter(works_query)
    for _ in range(max_items):
        candidate = next(candidates)
        if crossref_is_similar(candidate, entry, max_distance):
            doi = cr_parser.get_doi(candidate)
            entry = set_doi(entry, doi, update_URL)
            has_doi = True
            break

    return entry, has_doi
def format_authors(entry, abbreviate_first=True, et_al_at=1000):
    """Format the authors of *entry* as one readable line.

    This is one personal formatting preference; tweak as needed.

    :param entry: bibtex entry dict. It is not modified; a copy is processed.
    :param abbreviate_first: if true, given names are reduced to initials
        ("F. Paul" -> "F.P."); otherwise they are kept and space-joined.
    :param et_al_at: with more authors than this, only the first author
        followed by " et al." is returned.
    :return: the author line passed through ``cleanup``, e.g.
        "A, B and C"; an empty string when the entry has no usable authors.
    """
    # btxc.author() splits the author field in place, that's why we copy.
    r = entry.copy()
    btxc.author(r)
    names = r.get("author", [])
    authors = []

    for name in names:
        # {'first': ['F.', 'Paul'], 'last': ['Spitzner'], 'von': [], 'jr': []}
        name_parts = btxc.splitname(name)
        if not name_parts:
            # splitname gives {} for blank names; nothing to format.
            continue

        if not abbreviate_first:
            first = " ".join(name_parts["first"])
        else:
            first = ""
            for f in name_parts["first"]:
                if not f:
                    continue
                # Spelled-out names, bare initials ("J") and punctuated
                # initials ("J.") all reduce to "J.".
                if len(f) != 2 or f[1] in ".:;":
                    first += f[0] + "."
                else:
                    print(
                        f"Adapt the `format_authors` script to your needs for entry {r['ID']}"
                    )

        last = " ".join(name_parts["last"])
        von = " ".join(name_parts["von"])
        jr = " ".join(name_parts["jr"])

        # Stitch the name together and fix capitalisation. `last` is not
        # title-cased on purpose, that breaks e.g. "de Heuvel". Empty parts
        # are dropped so no stray leading/trailing space is produced.
        pieces = [first.title(), von.lower(), last, jr.lower()]
        authors.append(" ".join(piece for piece in pieces if piece))

    if not authors:
        return ""

    # now we have a list of nicely formatted authors, make this a readable
    # one-liner for the website
    if len(authors) > et_al_at:
        res = authors[0] + " et al."
    elif len(authors) == 1:
        res = authors[0]
    else:
        res = ", ".join(authors[:-1]) + " and " + authors[-1]

    # cleanup bibtex brackets
    return cleanup(res)
Exemple #15
0
def safe_splitname(s):
    """Split a name with ``c.splitname`` after dropping one trailing comma.

    Surrounding whitespace is stripped first; if what remains ends with a
    comma, that single character is removed before the name is split.
    """
    trimmed = s.strip()
    name = trimmed[:-1] if trimmed.endswith(",") else trimmed
    return c.splitname(name)
Exemple #16
0
    def sort(self):
        """Rebuild ``self._data`` as an OrderedDict sorted by entry value."""
        by_value = sorted(self._data.items(), key=lambda pair: pair[1])
        self._data = OrderedDict(by_value)

    def _handle_duplicates(self):
        """Drop repeated entries and number the remaining look-alikes.

        After sorting, an entry equal to the one directly before it is
        removed from ``self._data``. Then, for every id list in
        ``self.unique_authors_years``, entries sharing a list get a 1-based
        ``'letter_number'``; an entry alone in its list gets ``None``.
        """
        self.sort()

        # delete duplicates: after sorting, equal entries are neighbours
        previous = object()  # sentinel for the first comparison
        redundant_keys = []
        for key, entry in self._data.items():
            if entry == previous:
                redundant_keys.append(key)
            previous = entry
        for key in redundant_keys:
            self._data.pop(key, None)

        # set numbers/letters for pseudo-duplicates
        for _group, ids in self.unique_authors_years.items():
            if len(ids) > 1:
                for position, id_ in enumerate(ids, start=1):
                    self._data[id_]['letter_number'] = position
            else:
                self._data[ids[0]]['letter_number'] = None


if __name__ == '__main__':
    # Manual smoke check: print how splitname breaks up a name with a
    # "von" part. `n` holds a sample author list split on ' and '.
    n = "Orti, E. and Bredas, J. L. and Clarisse, C.".split(' and ')
    sample_parts = bib_custom.splitname("von Wright, Georg Henrik")
    print(sample_parts)