def load_data(self, input_filename):
    """Load ethnicity training data from a CSV file.

    Populates ``self.X`` with normalized "last, first" name strings and
    ``self.y`` with the corresponding integer ethnicity codes.

    Args:
        input_filename (str): Path to the CSV data file, e.g.::

            RACE,NAMELAST,NAMEFRST
            1,SHERIDAN,CHARLES B
            2,TAYLOR,HERDSON
            3,JOHNSON,LUCY A
    """
    labels = []
    full_names = []
    with open(input_filename, "r") as fd:
        for record in csv.DictReader(fd):
            labels.append(int(record["RACE"]))
            full_names.append("%s, %s" % (record["NAMELAST"], record["NAMEFRST"]))
    self.X = [normalize_name(full_name) for full_name in full_names]
    self.y = labels
def last_name_first_initial(name):
    """Return the normalized last name followed by the first initial.

    Falls back to the last name alone when no first name (or only
    whitespace) is present after normalization.
    """
    parts = normalize_name(name).split(" ", 1)
    # A usable first initial requires a second token with at least one
    # non-whitespace character after stripping.
    if len(parts) == 2 and parts[1].strip():
        return "%s %s" % (parts[0], parts[1].strip()[0])
    return parts[0]
def get_author_full_name(signature):
    """Get author_name normalized.

    Args:
        signature (Signature): Signature object

    Returns:
        str: Normalized `signature.author_name` or empty string if None
    """
    author_name = signature.author_name
    # Guard against a missing name so the documented contract
    # ("empty string if None") actually holds; this also mirrors
    # get_normalized_affiliation.
    return normalize_name(author_name) if author_name else ""
def get_normalized_affiliation(signature):
    """Get author_affiliations normalized.

    Args:
        signature (Signature): Signature object

    Returns:
        str: Normalized `signature.author_affiliation` or empty string if None
    """
    affiliation = signature.author_affiliation
    if not affiliation:
        return ""
    return normalize_name(affiliation)
def get_author_other_names(signature):
    """Get other names of author normalized.

    Args:
        signature (Signature): Signature object

    Returns:
        str: Normalized other names of author
    """
    # Other names are whatever follows the first comma in "Last, Other".
    parts = signature.author_name.split(",", 1)
    if len(parts) == 2:
        return normalize_name(parts[1])
    return ""
def affinity(X):
    """Compute pairwise distances between (author, affiliation) tuples.

    Note that this function is a heuristic. It should ideally be replaced
    by a more robust distance function, e.g. using a model learned over
    pairs of tuples.
    """
    # np.float was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin float is the documented replacement (same float64 dtype).
    distances = np.zeros((len(X), len(X)), dtype=float)

    for i, j in zip(*np.triu_indices(len(X), k=1)):
        name_i = normalize_name(X[i, 0])
        aff_i = X[i, 1]
        initials_i = name_initials(name_i)
        name_j = normalize_name(X[j, 0])
        aff_j = X[j, 1]
        initials_j = name_initials(name_j)

        # Initials are "compatible" when one set is a subset of the other,
        # i.e. their union is no larger than the bigger of the two sets.
        compatible_initials = (
            len(initials_i | initials_j) == max(len(initials_i),
                                                len(initials_j)))

        if name_i == name_j and aff_i == aff_j:
            # Names and affiliations match
            distances[i, j] = 0.0
        elif compatible_initials and aff_i == aff_j and aff_i != "":
            # Compatible initials and affiliations match
            distances[i, j] = 0.0
        elif not compatible_initials:
            # Initials are not compatible
            distances[i, j] = 1.0
        else:
            # We dont know
            distances[i, j] = 0.5

    # Mirror the upper triangle into the lower one (symmetric distances).
    distances += distances.T
    return distances
def affinity(X):
    """Compute pairwise distances between (author, affiliation) tuples.

    Note that this function is a heuristic. It should ideally be replaced
    by a more robust distance function, e.g. using a model learned over
    pairs of tuples.
    """
    # np.float was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin float is the documented replacement (same float64 dtype).
    distances = np.zeros((len(X), len(X)), dtype=float)

    for i, j in zip(*np.triu_indices(len(X), k=1)):
        name_i = normalize_name(X[i, 0])
        aff_i = X[i, 1]
        initials_i = name_initials(name_i)
        name_j = normalize_name(X[j, 0])
        aff_j = X[j, 1]
        initials_j = name_initials(name_j)

        # Initials are "compatible" when one set is a subset of the other,
        # i.e. their union is no larger than the bigger of the two sets.
        compatible_initials = (
            len(initials_i | initials_j) == max(len(initials_i),
                                                len(initials_j)))

        if name_i == name_j and aff_i == aff_j:
            # Names and affiliations match
            distances[i, j] = 0.0
        elif compatible_initials and aff_i == aff_j and aff_i != "":
            # Compatible initials and affiliations match
            distances[i, j] = 0.0
        elif not compatible_initials:
            # Initials are not compatible
            distances[i, j] = 1.0
        else:
            # We dont know
            distances[i, j] = 0.5

    # Mirror the upper triangle into the lower one (symmetric distances).
    distances += distances.T
    return distances
def load_data(self, input_filename):
    """Load ethnicity training data from *input_filename* (CSV).

    Sets ``self.X`` to normalized "last, first" name strings and
    ``self.y`` to the integer RACE codes read from the file.
    """
    labels, last_names, first_names = [], [], []
    with open(input_filename, 'r') as fd:
        for record in csv.DictReader(fd):
            labels.append(int(record['RACE']))
            last_names.append(record['NAMELAST'])
            first_names.append(record['NAMEFRST'])
    self.X = [
        normalize_name('%s, %s' % (last, first))
        for last, first in zip(last_names, first_names)
    ]
    self.y = labels
def get_author_full_name(s):
    """Get author full name from the signature.

    Parameters
    ----------
    :param s: dict
        Signature

    Returns
    -------
    :returns: string
        Normalized author name
    """
    name = s["author_name"]
    if not name:
        return ""
    return normalize_name(name)
def get_author_affiliation(s):
    """Get author affiliation from the signature.

    Parameters
    ----------
    :param s: dict
        Signature

    Returns
    -------
    :returns: string
        Normalized affiliation name
    """
    affiliation = s["author_affiliation"]
    if not affiliation:
        return ""
    return normalize_name(affiliation)
def load_data(self, input_filename):
    """Load ethnicity training data from a CSV file.

    After the call, ``self.X`` holds normalized "last, first" names and
    ``self.y`` the matching integer RACE labels.
    """
    labels = []
    raw_names = []
    with open(input_filename, 'r') as fd:
        reader = csv.DictReader(fd)
        for record in reader:
            labels.append(int(record['RACE']))
            raw_names.append('%s, %s' % (record['NAMELAST'],
                                         record['NAMEFRST']))
    self.X = [normalize_name(raw_name) for raw_name in raw_names]
    self.y = labels
def get_author_other_names(s):
    """Get author other names from the signature.

    Parameters
    ----------
    :param s: dict
        Signature

    Returns
    -------
    :returns: string
        Normalized other author names
    """
    # "Other names" are whatever follows the first comma in "Last, Other".
    parts = s["author_name"].split(",", 1)
    if len(parts) == 2:
        return normalize_name(parts[1])
    return ""
def get_author_other_names(s):
    """Return the normalized "other names" part of the signature's
    author name (the text after the first comma), or "" if absent."""
    full_name = s["author_name"]
    pieces = full_name.split(",", 1)
    if len(pieces) != 2:
        return ""
    return normalize_name(pieces[1])
def get_author_affiliation(s):
    """Return the normalized author affiliation from the signature dict,
    or "" when the affiliation is missing/None."""
    affiliation = s["author_affiliation"]
    return normalize_name(affiliation) if affiliation else ""
def get_author_full_name(signature):
    """Return the normalized author name from the signature dict.

    :param signature: dict with an 'author_name' key
    :returns: normalized name string, or '' when the name is missing/None
    """
    author_name = signature['author_name']
    # Guard against a missing/None name before normalizing, consistent
    # with get_author_affiliation.
    return normalize_name(author_name) if author_name else ''
def get_author_affiliation(signature):
    """Return the normalized author affiliation from the signature dict,
    or '' when the affiliation is missing/None."""
    affiliation = signature['author_affiliation']
    if not affiliation:
        return ''
    return normalize_name(affiliation)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_datafile", required=True, type=str)
    parser.add_argument("--output_ethnicity_estimator",
                        default="ethnicity_estimator.pickle", type=str)
    parser.add_argument("--C", default=4.0, type=float)
    args = parser.parse_args()

    # Load data: build "LAST, FIRST" strings and normalize them.
    data = pd.read_csv(args.input_datafile)
    y = data.RACE.values
    X = ["%s, %s" % (last, first)
         for last, first in zip(data.NAMELAST.values, data.NAMEFRST.values)]
    X = [normalize_name(name) for name in X]

    # Train an estimator: char n-gram tf-idf features + linear SVM.
    estimator = Pipeline([
        ("transformer", TfidfVectorizer(analyzer="char_wb",
                                        ngram_range=(1, 5),
                                        min_df=0.00005,
                                        dtype=np.float32,
                                        decode_error="replace")),
        ("classifier", LinearSVC(C=args.C))])
    estimator.fit(X, y)

    # Pickle is a binary format: the file must be opened in "wb" mode
    # (mode "w" raises TypeError on Python 3). A context manager also
    # guarantees the handle is flushed and closed.
    with open(args.output_ethnicity_estimator, "wb") as fd:
        pickle.dump(estimator, fd, protocol=pickle.HIGHEST_PROTOCOL)
def get_author_other_names(signature):
    """Return the normalized "other names" part (after the first comma)
    of the signature's author name, or '' if there is no comma."""
    parts = signature['author_name'].split(',', 1)
    if len(parts) != 2:
        return ''
    return normalize_name(parts[1])
def get_author_full_name(s):
    """Return the normalized author name from the signature dict,
    or "" when the name is missing/None."""
    name = s["author_name"]
    return normalize_name(name) if name else ""
parser = argparse.ArgumentParser()
parser.add_argument("--input_datafile", required=True, type=str)
parser.add_argument("--output_ethnicity_estimator",
                    default="ethnicity_estimator.pickle", type=str)
parser.add_argument("--C", default=4.0, type=float)
args = parser.parse_args()

# Load data: build "LAST, FIRST" strings and normalize them.
data = pd.read_csv(args.input_datafile)
y = data.RACE.values
X = [
    "%s, %s" % (last, first)
    for last, first in zip(data.NAMELAST.values, data.NAMEFRST.values)
]
X = [normalize_name(name) for name in X]

# Train an estimator: char n-gram tf-idf features + linear SVM.
estimator = Pipeline([("transformer",
                       TfidfVectorizer(analyzer="char_wb",
                                       ngram_range=(1, 5),
                                       min_df=0.00005,
                                       dtype=np.float32,
                                       decode_error="replace")),
                      ("classifier", LinearSVC(C=args.C))])
estimator.fit(X, y)

# Pickle is a binary format: the file must be opened in "wb" mode
# (mode "w" raises TypeError on Python 3). A context manager also
# guarantees the handle is flushed and closed.
with open(args.output_ethnicity_estimator, "wb") as fd:
    pickle.dump(estimator, fd, protocol=pickle.HIGHEST_PROTOCOL)