def load_data(self, input_filename):
    """Load ethnicity training data from a CSV file.

    Populates ``self.X`` with normalized "last, first" name strings and
    ``self.y`` with the corresponding integer ethnicity codes.

    Args:
        input_filename (str): Path to the CSV data file, e.g.::

            RACE,NAMELAST,NAMEFRST
            1,SHERIDAN,CHARLES B
            2,TAYLOR,HERDSON
            3,JOHNSON,LUCY A
    """
    labels = []
    full_names = []
    with open(input_filename, "r") as fd:
        for record in csv.DictReader(fd):
            labels.append(int(record["RACE"]))
            full_names.append("%s, %s" % (record["NAMELAST"], record["NAMEFRST"]))
    self.X = [normalize_name(full_name) for full_name in full_names]
    self.y = labels
def last_name_first_initial(name):
    """Return the normalized last name followed by the first initial.

    Falls back to the last name alone when no first name (or only
    whitespace) is present after normalization.
    """
    parts = normalize_name(name).split(" ", 1)
    # A usable first initial requires a second token with at least one
    # non-whitespace character after stripping.
    if len(parts) == 2 and parts[1].strip():
        return "%s %s" % (parts[0], parts[1].strip()[0])
    return parts[0]
def get_author_full_name(signature):
    """Get author_name normalized.

    Args:
        signature (Signature): Signature object

    Returns:
        str: Normalized `signature.author_name` or empty string if None
    """
    author_name = signature.author_name
    # Guard against a missing name so the documented contract
    # ("empty string if None") actually holds; this also mirrors
    # get_normalized_affiliation.
    return normalize_name(author_name) if author_name else ""
def get_normalized_affiliation(signature):
    """Get author_affiliations normalized.

    Args:
        signature (Signature): Signature object

    Returns:
        str: Normalized `signature.author_affiliation` or empty string if None
    """
    affiliation = signature.author_affiliation
    if not affiliation:
        return ""
    return normalize_name(affiliation)
def get_author_other_names(signature):
    """Get other names of author normalized.

    Args:
        signature (Signature): Signature object

    Returns:
        str: Normalized other names of author
    """
    # Other names are whatever follows the first comma in "Last, Other".
    parts = signature.author_name.split(",", 1)
    if len(parts) == 2:
        return normalize_name(parts[1])
    return ""
def affinity(X):
    """Compute pairwise distances between (author, affiliation) tuples.

    Note that this function is a heuristic. It should ideally be replaced
    by a more robust distance function, e.g. using a model learned over
    pairs of tuples.
    """
    # np.float was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin float is the documented replacement (same float64 dtype).
    distances = np.zeros((len(X), len(X)), dtype=float)

    for i, j in zip(*np.triu_indices(len(X), k=1)):
        name_i = normalize_name(X[i, 0])
        aff_i = X[i, 1]
        initials_i = name_initials(name_i)
        name_j = normalize_name(X[j, 0])
        aff_j = X[j, 1]
        initials_j = name_initials(name_j)

        # Initials are "compatible" when one set is a subset of the other,
        # i.e. their union is no larger than the bigger of the two sets.
        compatible_initials = (
            len(initials_i | initials_j) == max(len(initials_i),
                                                len(initials_j)))

        if name_i == name_j and aff_i == aff_j:
            # Names and affiliations match
            distances[i, j] = 0.0
        elif compatible_initials and aff_i == aff_j and aff_i != "":
            # Compatible initials and affiliations match
            distances[i, j] = 0.0
        elif not compatible_initials:
            # Initials are not compatible
            distances[i, j] = 1.0
        else:
            # We dont know
            distances[i, j] = 0.5

    # Mirror the upper triangle into the lower one (symmetric distances).
    distances += distances.T
    return distances
def affinity(X):
    """Compute pairwise distances between (author, affiliation) tuples.

    Note that this function is a heuristic. It should ideally be replaced
    by a more robust distance function, e.g. using a model learned over
    pairs of tuples.
    """
    # np.float was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin float is the documented replacement (same float64 dtype).
    distances = np.zeros((len(X), len(X)), dtype=float)

    for i, j in zip(*np.triu_indices(len(X), k=1)):
        name_i = normalize_name(X[i, 0])
        aff_i = X[i, 1]
        initials_i = name_initials(name_i)
        name_j = normalize_name(X[j, 0])
        aff_j = X[j, 1]
        initials_j = name_initials(name_j)

        # Initials are "compatible" when one set is a subset of the other,
        # i.e. their union is no larger than the bigger of the two sets.
        compatible_initials = (
            len(initials_i | initials_j) == max(len(initials_i),
                                                len(initials_j)))

        if name_i == name_j and aff_i == aff_j:
            # Names and affiliations match
            distances[i, j] = 0.0
        elif compatible_initials and aff_i == aff_j and aff_i != "":
            # Compatible initials and affiliations match
            distances[i, j] = 0.0
        elif not compatible_initials:
            # Initials are not compatible
            distances[i, j] = 1.0
        else:
            # We dont know
            distances[i, j] = 0.5

    # Mirror the upper triangle into the lower one (symmetric distances).
    distances += distances.T
    return distances
def load_data(self, input_filename):
    """Load ethnicity training data from *input_filename* (CSV).

    Sets ``self.X`` to normalized "last, first" name strings and
    ``self.y`` to the integer RACE codes read from the file.
    """
    labels, last_names, first_names = [], [], []
    with open(input_filename, 'r') as fd:
        for record in csv.DictReader(fd):
            labels.append(int(record['RACE']))
            last_names.append(record['NAMELAST'])
            first_names.append(record['NAMEFRST'])
    self.X = [
        normalize_name('%s, %s' % (last, first))
        for last, first in zip(last_names, first_names)
    ]
    self.y = labels
def get_author_full_name(s):
    """Get author full name from the signature.

    Parameters
    ----------
    :param s: dict
        Signature

    Returns
    -------
    :returns: string
        Normalized author name
    """
    name = s["author_name"]
    if not name:
        return ""
    return normalize_name(name)
def get_author_affiliation(s):
    """Get author affiliation from the signature.

    Parameters
    ----------
    :param s: dict
        Signature

    Returns
    -------
    :returns: string
        Normalized affiliation name
    """
    affiliation = s["author_affiliation"]
    if not affiliation:
        return ""
    return normalize_name(affiliation)
def load_data(self, input_filename):
    """Load ethnicity training data from a CSV file.

    After the call, ``self.X`` holds normalized "last, first" names and
    ``self.y`` the matching integer RACE labels.
    """
    labels = []
    raw_names = []
    with open(input_filename, 'r') as fd:
        reader = csv.DictReader(fd)
        for record in reader:
            labels.append(int(record['RACE']))
            raw_names.append('%s, %s' % (record['NAMELAST'],
                                         record['NAMEFRST']))
    self.X = [normalize_name(raw_name) for raw_name in raw_names]
    self.y = labels
def get_author_other_names(s):
    """Get author other names from the signature.

    Parameters
    ----------
    :param s: dict
        Signature

    Returns
    -------
    :returns: string
        Normalized other author names
    """
    # "Other names" are whatever follows the first comma in "Last, Other".
    parts = s["author_name"].split(",", 1)
    if len(parts) == 2:
        return normalize_name(parts[1])
    return ""
def get_author_other_names(s):
    """Return the normalized "other names" part of the signature's
    author name (the text after the first comma), or "" if absent."""
    full_name = s["author_name"]
    pieces = full_name.split(",", 1)
    if len(pieces) != 2:
        return ""
    return normalize_name(pieces[1])
def get_author_affiliation(s):
    """Return the normalized author affiliation from the signature dict,
    or "" when the affiliation is missing/None."""
    affiliation = s["author_affiliation"]
    return normalize_name(affiliation) if affiliation else ""
def get_author_full_name(signature):
    """Return the normalized author name from the signature dict.

    :param signature: dict with an 'author_name' key
    :returns: normalized name string, or '' when the name is missing/None
    """
    author_name = signature['author_name']
    # Guard against a missing/None name before normalizing, consistent
    # with get_author_affiliation.
    return normalize_name(author_name) if author_name else ''
def get_author_affiliation(signature):
    """Return the normalized author affiliation from the signature dict,
    or '' when the affiliation is missing/None."""
    affiliation = signature['author_affiliation']
    if not affiliation:
        return ''
    return normalize_name(affiliation)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_datafile", required=True, type=str)
    parser.add_argument("--output_ethnicity_estimator",
                        default="ethnicity_estimator.pickle", type=str)
    parser.add_argument("--C", default=4.0, type=float)
    args = parser.parse_args()

    # Load data: build "LAST, FIRST" strings and normalize them.
    data = pd.read_csv(args.input_datafile)
    y = data.RACE.values
    X = ["%s, %s" % (last, first)
         for last, first in zip(data.NAMELAST.values, data.NAMEFRST.values)]
    X = [normalize_name(name) for name in X]

    # Train an estimator: char n-gram tf-idf features + linear SVM.
    estimator = Pipeline([
        ("transformer", TfidfVectorizer(analyzer="char_wb",
                                        ngram_range=(1, 5),
                                        min_df=0.00005,
                                        dtype=np.float32,
                                        decode_error="replace")),
        ("classifier", LinearSVC(C=args.C))])
    estimator.fit(X, y)

    # Pickle is a binary format: the file must be opened in "wb" mode
    # (mode "w" raises TypeError on Python 3). A context manager also
    # guarantees the handle is flushed and closed.
    with open(args.output_ethnicity_estimator, "wb") as fd:
        pickle.dump(estimator, fd, protocol=pickle.HIGHEST_PROTOCOL)
def get_author_other_names(signature):
    """Return the normalized "other names" part (after the first comma)
    of the signature's author name, or '' if there is no comma."""
    parts = signature['author_name'].split(',', 1)
    if len(parts) != 2:
        return ''
    return normalize_name(parts[1])
def get_author_full_name(s):
    """Return the normalized author name from the signature dict,
    or "" when the name is missing/None."""
    name = s["author_name"]
    return normalize_name(name) if name else ""
parser = argparse.ArgumentParser()
parser.add_argument("--input_datafile", required=True, type=str)
parser.add_argument("--output_ethnicity_estimator",
                    default="ethnicity_estimator.pickle", type=str)
parser.add_argument("--C", default=4.0, type=float)
args = parser.parse_args()

# Load data: build "LAST, FIRST" strings and normalize them.
data = pd.read_csv(args.input_datafile)
y = data.RACE.values
X = [
    "%s, %s" % (last, first)
    for last, first in zip(data.NAMELAST.values, data.NAMEFRST.values)
]
X = [normalize_name(name) for name in X]

# Train an estimator: char n-gram tf-idf features + linear SVM.
estimator = Pipeline([("transformer",
                       TfidfVectorizer(analyzer="char_wb",
                                       ngram_range=(1, 5),
                                       min_df=0.00005,
                                       dtype=np.float32,
                                       decode_error="replace")),
                      ("classifier", LinearSVC(C=args.C))])
estimator.fit(X, y)

# Pickle is a binary format: the file must be opened in "wb" mode
# (mode "w" raises TypeError on Python 3). A context manager also
# guarantees the handle is flushed and closed.
with open(args.output_ethnicity_estimator, "wb") as fd:
    pickle.dump(estimator, fd, protocol=pickle.HIGHEST_PROTOCOL)