def load_data(self, input_filename):
        """Read labelled names from a CSV file and store them on the instance.

        Every row must provide the columns ``RACE`` (parsed with ``int``),
        ``NAMELAST`` and ``NAMEFRST``. Each name is assembled as
        ``"<NAMELAST>, <NAMEFRST>"`` and passed through ``normalize_name``.
        The resulting names are stored in ``self.X`` and the integer labels
        in ``self.y``, in file order.

        Args:
            input_filename (str): Path to data file

        Example:
            RACE,NAMELAST,NAMEFRST
            1,SHERIDAN,CHARLES B
            2,TAYLOR,HERDSON
            3,JOHNSON,LUCY A
        """
        # First pass: read the whole file into (label, last, first) records.
        records = []
        with open(input_filename, "r") as fd:
            for row in csv.DictReader(fd):
                records.append(
                    (int(row["RACE"]), row["NAMELAST"], row["NAMEFRST"])
                )

        # Second pass: build the samples and labels from the records.
        self.X = [
            normalize_name("%s, %s" % (last, first))
            for _, last, first in records
        ]
        self.y = [label for label, _, _ in records]
예제 #2
0
    def last_name_first_initial(name):
        """Shorten a name to its first token plus one initial.

        The name is passed through ``normalize_name`` and split at the first
        space. The result is ``"<first token> <initial>"``, where the initial
        is the first character of the stripped remainder. When there is no
        remainder (or it is blank) only the first token is returned.
        Judging by the function name, the first token is presumably the
        last name.
        """
        parts = normalize_name(name).split(" ", 1)
        head = parts[0]
        remainder = parts[1].strip() if len(parts) > 1 else ""
        if not remainder:
            return head
        return "%s %s" % (head, remainder[0])
예제 #3
0
    def last_name_first_initial(name):
        """Shorten a name to its first token plus one initial.

        The name is passed through ``normalize_name`` and split at the first
        space. The result is ``"<first token> <initial>"``, where the initial
        is the first character of the stripped remainder. When there is no
        remainder (or it is blank) only the first token is returned.
        Judging by the function name, the first token is presumably the
        last name.
        """
        tokens = normalize_name(name).split(" ", 1)
        first_token = tokens[0]
        rest = tokens[1].strip() if len(tokens) > 1 else ""
        if rest:
            return "%s %s" % (first_token, rest[0])
        return first_token
def get_author_full_name(signature):
    """Get author_name normalized.

    Args:
        signature (Signature): Signature object

    Returns:
        str: Normalized `signature.author_name` or empty string if None

    """
    author_name = signature.author_name
    # Honor the documented contract: a missing name yields "" instead of
    # being handed to normalize_name.
    if author_name is None:
        return ""
    return normalize_name(author_name)
def get_normalized_affiliation(signature):
    """Return the signature's affiliation passed through ``normalize_name``.

    Args:
        signature (Signature): Signature object

    Returns:
        str: Normalized `signature.author_affiliation`, or empty string when
            the affiliation is None or otherwise falsy

    """
    affiliation = signature.author_affiliation
    if not affiliation:
        return ""
    return normalize_name(affiliation)
def get_author_other_names(signature):
    """Get other names of author normalized.

    The "other names" are whatever follows the first comma of
    `signature.author_name`.

    Args:
        signature (Signature): Signature object

    Returns:
        str: Normalized other names of author, or empty string when the
            author name is missing/empty or contains no comma

    """
    author_name = signature.author_name
    # A missing name has no "other names"; avoid calling .partition on None.
    if not author_name:
        return ""
    _, comma, other_names = author_name.partition(",")
    return normalize_name(other_names) if comma else ""
예제 #7
0
def affinity(X):
    """Compute pairwise distances between (author, affiliation) tuples.

    Note that this function is a heuristic. It should ideally be replaced
    by a more robust distance function, e.g. using a model learned over
    pairs of tuples.

    Parameters
    ----------
    X : sequence indexed as ``X[i, 0]`` (author name) and ``X[i, 1]``
        (affiliation); presumably a two-column NumPy array.

    Returns
    -------
    numpy.ndarray
        Symmetric ``(len(X), len(X))`` float matrix with a zero diagonal.
        Each off-diagonal entry is 0.0 (considered the same), 1.0
        (incompatible initials) or 0.5 (undecided).
    """
    n_samples = len(X)
    # ``np.float`` was a plain alias of the builtin ``float`` and has been
    # removed from NumPy; the builtin gives the same float64 dtype.
    distances = np.zeros((n_samples, n_samples), dtype=float)

    # Only the strict upper triangle is filled; it is mirrored at the end.
    for i, j in zip(*np.triu_indices(n_samples, k=1)):
        name_i = normalize_name(X[i, 0])
        aff_i = X[i, 1]
        initials_i = name_initials(name_i)
        name_j = normalize_name(X[j, 0])
        aff_j = X[j, 1]
        initials_j = name_initials(name_j)

        # Initials are compatible when their union is no larger than the
        # larger of the two collections (assuming name_initials returns
        # sets, this means one set of initials contains the other).
        compatible = (len(initials_i | initials_j) ==
                      max(len(initials_i), len(initials_j)))

        # Names and affiliations match
        if name_i == name_j and aff_i == aff_j:
            distances[i, j] = 0.0

        # Compatible initials and affiliations match
        elif compatible and aff_i == aff_j and aff_i != "":
            distances[i, j] = 0.0

        # Initials are not compatible
        elif not compatible:
            distances[i, j] = 1.0

        # We dont know
        else:
            distances[i, j] = 0.5

    distances += distances.T
    return distances
예제 #8
0
def affinity(X):
    """Compute pairwise distances between (author, affiliation) tuples.

    Note that this function is a heuristic. It should ideally be replaced
    by a more robust distance function, e.g. using a model learned over
    pairs of tuples.

    Parameters
    ----------
    X : sequence indexed as ``X[i, 0]`` (author name) and ``X[i, 1]``
        (affiliation); presumably a two-column NumPy array.

    Returns
    -------
    numpy.ndarray
        Symmetric ``(len(X), len(X))`` float matrix with a zero diagonal.
        Each off-diagonal entry is 0.0 (considered the same), 1.0
        (incompatible initials) or 0.5 (undecided).
    """
    size = len(X)
    # ``np.float`` was a plain alias of the builtin ``float`` and has been
    # removed from NumPy; the builtin gives the same float64 dtype.
    distances = np.zeros((size, size), dtype=float)

    # Only the strict upper triangle is filled; it is mirrored at the end.
    for i, j in zip(*np.triu_indices(size, k=1)):
        name_i = normalize_name(X[i, 0])
        aff_i = X[i, 1]
        initials_i = name_initials(name_i)
        name_j = normalize_name(X[j, 0])
        aff_j = X[j, 1]
        initials_j = name_initials(name_j)

        # Initials are compatible when their union is no larger than the
        # larger of the two collections (assuming name_initials returns
        # sets, this means one set of initials contains the other).
        compatible = (len(initials_i | initials_j) ==
                      max(len(initials_i), len(initials_j)))

        # Names and affiliations match
        if name_i == name_j and aff_i == aff_j:
            distances[i, j] = 0.0

        # Compatible initials and affiliations match
        elif compatible and aff_i == aff_j and aff_i != "":
            distances[i, j] = 0.0

        # Initials are not compatible
        elif not compatible:
            distances[i, j] = 1.0

        # We dont know
        else:
            distances[i, j] = 0.5

    distances += distances.T
    return distances
예제 #9
0
    def load_data(self, input_filename):
        """Read labelled names from a CSV file and store them on the instance.

        Every row must provide the columns ``RACE`` (parsed with ``int``),
        ``NAMELAST`` and ``NAMEFRST``. Each name is assembled as
        ``"<NAMELAST>, <NAMEFRST>"`` and passed through ``normalize_name``.
        The resulting names are stored in ``self.X`` and the integer labels
        in ``self.y``, in file order.
        """
        # First pass: read the whole file into (label, last, first) records.
        records = []
        with open(input_filename, 'r') as fd:
            for row in csv.DictReader(fd):
                records.append(
                    (int(row['RACE']), row['NAMELAST'], row['NAMEFRST'])
                )

        # Second pass: build the samples and labels from the records.
        self.X = [
            normalize_name('%s, %s' % (last, first))
            for _, last, first in records
        ]
        self.y = [label for label, _, _ in records]
예제 #10
0
def get_author_full_name(s):
    """Return the signature's author name passed through ``normalize_name``.

    Parameters
    ----------
    :param s: dict
        Signature; must contain the key ``"author_name"``

    Returns
    -------
    :returns: string
        Normalized author name, or an empty string when the stored name is
        None or otherwise falsy
    """
    full_name = s["author_name"]
    if not full_name:
        return ""
    return normalize_name(full_name)
예제 #11
0
def get_author_affiliation(s):
    """Return the signature's affiliation passed through ``normalize_name``.

    Parameters
    ----------
    :param s: dict
        Signature; must contain the key ``"author_affiliation"``

    Returns
    -------
    :returns: string
        Normalized affiliation name, or an empty string when the stored
        affiliation is None or otherwise falsy
    """
    affiliation = s["author_affiliation"]
    if not affiliation:
        return ""
    return normalize_name(affiliation)
예제 #12
0
def get_author_affiliation(s):
    """Look up and normalize the affiliation stored in a signature.

    Parameters
    ----------
    :param s: dict
        Signature; must contain the key ``"author_affiliation"``

    Returns
    -------
    :returns: string
        The affiliation passed through ``normalize_name``, or an empty
        string when the stored affiliation is None or otherwise falsy
    """
    raw_affiliation = s["author_affiliation"]
    if raw_affiliation:
        return normalize_name(raw_affiliation)
    return ""
예제 #13
0
def get_author_full_name(s):
    """Look up and normalize the author name stored in a signature.

    Parameters
    ----------
    :param s: dict
        Signature; must contain the key ``"author_name"``

    Returns
    -------
    :returns: string
        The author name passed through ``normalize_name``, or an empty
        string when the stored name is None or otherwise falsy
    """
    raw_name = s["author_name"]
    if raw_name:
        return normalize_name(raw_name)
    return ""
예제 #14
0
    def load_data(self, input_filename):
        """Load names and integer labels from a CSV file onto the instance.

        The file must have the columns ``RACE``, ``NAMELAST`` and
        ``NAMEFRST``. ``self.X`` receives one ``normalize_name`` result per
        row, computed from ``"<NAMELAST>, <NAMEFRST>"``; ``self.y`` receives
        ``int(RACE)`` for each row. Both lists follow file order.
        """
        # Read everything first so parsing happens before normalization.
        rows = []
        with open(input_filename, 'r') as fd:
            for record in csv.DictReader(fd):
                rows.append(
                    (int(record['RACE']),
                     record['NAMELAST'],
                     record['NAMEFRST'])
                )

        self.X = [
            normalize_name('%s, %s' % (last, first))
            for _, last, first in rows
        ]
        self.y = [race for race, _, _ in rows]
예제 #15
0
def get_author_other_names(s):
    """Get author other names from the signature.

    The "other names" are whatever follows the first comma of the stored
    author name.

    Parameters
    ----------
    :param s: dict
        Signature; must contain the key ``"author_name"``

    Returns
    -------
    :returns: string
        Normalized other author names, or an empty string when the name is
        missing/empty or contains no comma
    """
    author_name = s["author_name"]
    # A missing name has no "other names"; avoid calling .partition on None.
    if not author_name:
        return ""
    _, comma, other_names = author_name.partition(",")
    return normalize_name(other_names) if comma else ""
예제 #16
0
def get_author_other_names(s):
    """Get author other names from the signature.

    The "other names" are whatever follows the first comma of the stored
    author name.

    Parameters
    ----------
    :param s: dict
        Signature; must contain the key ``"author_name"``

    Returns
    -------
    :returns: string
        Normalized other author names, or an empty string when the name is
        missing/empty or contains no comma
    """
    author_name = s["author_name"]
    # A missing name has no "other names"; avoid calling .partition on None.
    if not author_name:
        return ""
    _, comma, other_names = author_name.partition(",")
    return normalize_name(other_names) if comma else ""
예제 #17
0
def get_author_other_names(s):
    """Get author other names from the signature.

    The "other names" are whatever follows the first comma of
    ``s["author_name"]``, passed through ``normalize_name``. An empty string
    is returned when the name is missing/empty or contains no comma.
    """
    author_name = s["author_name"]
    # A missing name has no "other names"; avoid calling .partition on None.
    if not author_name:
        return ""
    _, comma, other_names = author_name.partition(",")
    return normalize_name(other_names) if comma else ""
예제 #18
0
def get_author_affiliation(s):
    """Return ``s["author_affiliation"]`` passed through ``normalize_name``.

    An empty string is returned when the stored affiliation is None or
    otherwise falsy.
    """
    affiliation = s["author_affiliation"]
    if not affiliation:
        return ""
    return normalize_name(affiliation)
예제 #19
0
def get_author_full_name(signature):
    """Return ``signature['author_name']`` passed through ``normalize_name``.

    An empty string is returned when the signature carries no author name
    (``None``).
    """
    author_name = signature['author_name']
    # A signature without a name yields '' instead of handing None to
    # normalize_name.
    if author_name is None:
        return ''
    return normalize_name(author_name)
예제 #20
0
def get_author_full_name(signature):
    """Return ``signature['author_name']`` passed through ``normalize_name``.

    An empty string is returned when the signature carries no author name
    (``None``).
    """
    author_name = signature['author_name']
    # A signature without a name yields '' instead of handing None to
    # normalize_name.
    if author_name is None:
        return ''
    return normalize_name(author_name)
예제 #21
0
def get_author_affiliation(signature):
    """Return the signature's affiliation passed through ``normalize_name``.

    An empty string is returned when ``signature['author_affiliation']`` is
    None or otherwise falsy.
    """
    affiliation = signature['author_affiliation']
    if not affiliation:
        return ''
    return normalize_name(affiliation)
예제 #22
0
파일: ethnicity.py 프로젝트: MSusik/beard

if __name__ == "__main__":
    # Command-line interface: training data location, output path of the
    # pickled estimator and the C parameter passed to LinearSVC.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_datafile", required=True, type=str)
    parser.add_argument("--output_ethnicity_estimator",
                        default="ethnicity_estimator.pickle", type=str)
    parser.add_argument("--C", default=4.0, type=float)
    args = parser.parse_args()

    # Load data: labels come from the RACE column, samples are
    # "<NAMELAST>, <NAMEFRST>" strings passed through normalize_name.
    data = pd.read_csv(args.input_datafile)
    y = data.RACE.values
    X = ["%s, %s" % (last, first) for last, first in zip(data.NAMELAST.values,
                                                         data.NAMEFRST.values)]
    X = [normalize_name(name) for name in X]

    # Train an estimator: character n-gram TF-IDF features fed to a
    # linear SVM.
    estimator = Pipeline([
        ("transformer", TfidfVectorizer(analyzer="char_wb",
                                        ngram_range=(1, 5),
                                        min_df=0.00005,
                                        dtype=np.float32,
                                        decode_error="replace")),
        ("classifier", LinearSVC(C=args.C))])
    estimator.fit(X, y)

    # Pickle data is binary: the file must be opened in "wb" mode (text mode
    # fails on Python 3 and can corrupt the stream elsewhere), and the
    # context manager guarantees the file is flushed and closed.
    with open(args.output_ethnicity_estimator, "wb") as fd:
        pickle.dump(estimator, fd, protocol=pickle.HIGHEST_PROTOCOL)
예제 #23
0
def get_author_affiliation(signature):
    """Look up and normalize the affiliation stored in a signature.

    Returns ``normalize_name(signature['author_affiliation'])``, or an empty
    string when that value is None or otherwise falsy.
    """
    raw_affiliation = signature['author_affiliation']
    if raw_affiliation:
        return normalize_name(raw_affiliation)
    return ''
예제 #24
0
def get_author_other_names(signature):
    """Return the part of the author name after the first comma, normalized.

    The text following the first comma of ``signature['author_name']`` is
    passed through ``normalize_name``. An empty string is returned when the
    name is missing/empty or contains no comma.
    """
    author_name = signature['author_name']
    # A missing name has no "other names"; avoid calling .partition on None.
    if not author_name:
        return ''
    _, comma, other_names = author_name.partition(',')
    return normalize_name(other_names) if comma else ''
예제 #25
0
def get_author_full_name(s):
    """Return ``s["author_name"]`` passed through ``normalize_name``.

    An empty string is returned when the stored name is None or otherwise
    falsy.
    """
    full_name = s["author_name"]
    if not full_name:
        return ""
    return normalize_name(full_name)
예제 #26
0
    # Command-line interface: training data location, output path of the
    # pickled estimator and the C parameter passed to LinearSVC.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_datafile", required=True, type=str)
    parser.add_argument("--output_ethnicity_estimator",
                        default="ethnicity_estimator.pickle",
                        type=str)
    parser.add_argument("--C", default=4.0, type=float)
    args = parser.parse_args()

    # Load data: labels come from the RACE column, samples are
    # "<NAMELAST>, <NAMEFRST>" strings passed through normalize_name.
    data = pd.read_csv(args.input_datafile)
    y = data.RACE.values
    X = [
        "%s, %s" % (last, first)
        for last, first in zip(data.NAMELAST.values, data.NAMEFRST.values)
    ]
    X = [normalize_name(name) for name in X]

    # Train an estimator: character n-gram TF-IDF features fed to a
    # linear SVM.
    estimator = Pipeline([("transformer",
                           TfidfVectorizer(analyzer="char_wb",
                                           ngram_range=(1, 5),
                                           min_df=0.00005,
                                           dtype=np.float32,
                                           decode_error="replace")),
                          ("classifier", LinearSVC(C=args.C))])
    estimator.fit(X, y)

    # Pickle data is binary: the file must be opened in "wb" mode (text mode
    # fails on Python 3 and can corrupt the stream elsewhere), and the
    # context manager guarantees the file is flushed and closed.
    with open(args.output_ethnicity_estimator, "wb") as fd:
        pickle.dump(estimator, fd, protocol=pickle.HIGHEST_PROTOCOL)
예제 #27
0
def get_author_other_names(signature):
    """Return the part of the author name after the first comma, normalized.

    The text following the first comma of ``signature['author_name']`` is
    passed through ``normalize_name``. An empty string is returned when the
    name is missing/empty or contains no comma.
    """
    author_name = signature['author_name']
    # A missing name has no "other names"; avoid calling .partition on None.
    if not author_name:
        return ''
    _, comma, other_names = author_name.partition(',')
    return normalize_name(other_names) if comma else ''