Esempio n. 1
0
def fuzzy_match(txt1, txt2):
    """Score how much corporation-name wording two strings have in common.

    Both inputs are parsed with ``pp.parse``; only tokens labelled
    ``'CorporationName'`` are kept, stripped of non-alphanumeric characters
    and lower-cased.  Every contiguous run of tokens is generated for each
    side, and the runs present on both sides are reduced to those that are
    not a substring of any run sorted after them (runs are sorted by word
    count).  Single-word runs are then discarded.

    Returns:
        The number of words in the surviving runs divided by the mean token
        count of the two sides, capped at 1.  ``0`` when neither side has any
        corporation-name token.
    """

    def _corp_tokens(text):
        # Normalised 'CorporationName' tokens of ``text`` in parse order.
        return [
            re.sub(r'[^a-zA-Z0-9]', '', token).lower()
            for token, label in pp.parse(text)
            if label == 'CorporationName'
        ]

    def _token_runs(tokens):
        # Every contiguous run of ``tokens``, joined with single spaces.
        total = len(tokens)
        runs = set()
        for width in range(1, total + 1):
            for start in range(total + 1 - width):
                runs.add(" ".join(tokens[start:start + width]))
        return runs

    tokens1 = _corp_tokens(txt1)
    tokens2 = _corp_tokens(txt2)
    runs1 = _token_runs(tokens1)
    runs2 = _token_runs(tokens2)

    # Shortest runs first, so each run is only compared with runs that are
    # at least as long.
    shared = sorted(runs1.intersection(runs2),
                    key=lambda run: len(run.split()))

    maximal = []
    for position, run in enumerate(shared):
        if not any(run in other for other in shared[position + 1:]):
            maximal.append(run)

    matched_len = 0
    for run in maximal:
        words = run.split()
        if len(words) > 1:
            matched_len += len(words)

    base = np.mean([len(tokens1), len(tokens2)])
    matched_score = matched_len / base if base > 0 else 0
    return min(1, matched_score)
def addFailedPreds(tagged_list, train_file):
    """Append wrongly parsed items to ``train_file`` and retrain the model.

    Each item of ``tagged_list`` is compared with
    ``probablepeople.parse(item[0])``.  A match prints ``.``; a mismatch
    appends the item to ``train_file`` and prints ``*``.  The model is
    retrained after every tenth appended item and once more after the loop.

    Args:
        tagged_list: iterable of items whose first element is the raw string
            and which compare equal to a one-element parse result when the
            parser is correct.
        train_file: path passed to ``data_prep_utils.appendListToXMLfile``.
    """

    def retrain():
        # Reload the labelled data and rebuild the CRF settings file.
        training_data = list(
            data_prep_utils.parseTrainingData(
                "training/training_data/labeled.xml"))
        trainModel(training_data, "probablepeople/learned_settings.crfsuite")

    print("adding failures")
    total_added = 0
    pending = 0  # failures appended since the last retrain
    for index, tagged_item in enumerate(tagged_list):
        if index % 20 == 0:
            print()  # start a new progress row every 20 items

        if probablepeople.parse(tagged_item[0]) == [tagged_item]:
            print(".", end=" ")
            continue

        data_prep_utils.appendListToXMLfile([[tagged_item]], train_file)
        print("*", end=" ")
        pending += 1
        total_added += 1
        if pending == 10:
            pending = 0
            print("\n", "-" * 50, "RETRAINING ", index)
            retrain()

    print("\n", "-" * 50, "RETRAINING")
    retrain()
    print(total_added, " cases added to ", train_file)
Esempio n. 3
0
def find_repeated_label(name_, type=None):
    """Analyze a name whose parse contains a repeated label.

    The name is parsed with ``probablepeople.parse`` and the (value, label)
    pairs are scanned in order.  The first label seen a second time becomes
    the problem key; only values carrying that label are collected, so a
    later, different repeated label can neither replace the key nor pollute
    the collected values.

    Args:
        name_ (str): Name of a person e.g. 'Cpt. James T. Kirk'
        type (str, optional): type of parser probablepeople should use, one
            of 'person', 'company', or None (None=default, which implies
            'generic')

    Returns:
        problem_key: str or None   the first label that was repeated, or
            None when no label repeats
        problem_vals: set   the parsed values seen with that label
        nameparts_so_far: dict   maps every label to the last value parsed
            for it, so it includes the problem key with one of the possible
            values
    """
    parsed = probablepeople.parse(name_, type)

    nameparts_so_far = {}
    problem_key = None
    problem_vals = set()

    for value, label in parsed:
        if label in nameparts_so_far:
            if problem_key is None:
                # Lock onto the first repeated label only.
                problem_key = label
            if label == problem_key:
                problem_vals.add(value)
                problem_vals.add(nameparts_so_far[label])
        nameparts_so_far[label] = value

    return problem_key, problem_vals, nameparts_so_far
Esempio n. 4
0
def addFailedPreds(tagged_list, train_file):
    """Collect parser failures into ``train_file``, retraining periodically.

    For every item in ``tagged_list`` the raw string (``item[0]``) is parsed
    with ``probablepeople.parse``.  When the result differs from ``[item]``
    the item is appended to ``train_file``.  Progress is printed as ``.``
    (correct) or ``*`` (added), 20 items per row.  The model is retrained
    after each batch of ten additions and once more at the end.
    """
    labeled_path = 'training/training_data/labeled.xml'
    settings_path = 'probablepeople/learned_settings.crfsuite'

    print("adding failures")
    failures = 0
    for index, tagged_item in enumerate(tagged_list):
        if index % 20 == 0:
            print()

        parsed_ok = probablepeople.parse(tagged_item[0]) == [tagged_item]
        if parsed_ok:
            print(".", end=' ')
        else:
            data_prep_utils.appendListToXMLfile([[tagged_item]], train_file)
            print("*", end=' ')
            failures += 1
            # Every tenth failure triggers an intermediate retrain.
            if failures % 10 == 0:
                print("\n", "-" * 50, "RETRAINING ", index)
                batch = list(data_prep_utils.parseTrainingData(labeled_path))
                trainModel(batch, settings_path)

    print("\n", "-" * 50, "RETRAINING")
    final_data = list(data_prep_utils.parseTrainingData(labeled_path))
    trainModel(final_data, settings_path)
    print(failures, " cases added to ", train_file)
def get_candidate_info(row):
    """Split a results row into its columns, parsed name and party code.

    The row is split on runs of two or more whitespace characters.  The first
    column is expected to look like ``"<name> (<party>..."``: the text before
    the first ``(`` is parsed as a person name and the first three characters
    after it are taken as the party.

    Returns:
        ``(cand_info, full_name, cand_party)`` -- the list of columns, the
        ``pp.parse`` result for the name, and the party string.

    Raises:
        IndexError: if the first column contains no ``(``.
    """
    cand_info = re.split(r"\s{2,}", row)
    name_and_party = cand_info[0].split('(')
    full_name = pp.parse(name_and_party[0], 'person')
    cand_party = name_and_party[1][:3]

    return cand_info, full_name, cand_party
Esempio n. 6
0
def pp_contact_name(contact, leave_case=False):
    """Add parsed given/middle/surname fields to a contact mapping.

    ``contact['PERSON']`` is parsed as a person name.  For every part
    labelled ``GivenName``, ``Surname`` or ``MiddleName`` the token is
    stripped of everything except letters, hyphens and apostrophes and
    stored in ``contact`` under the label itself.  Values are lower-cased
    unless ``leave_case`` is true.

    Returns:
        The same ``contact`` object, modified in place.
    """
    wanted_labels = ('GivenName', 'Surname', 'MiddleName')
    for token, label in pp.parse(contact['PERSON'], type='person'):
        if label not in wanted_labels:
            continue
        cleaned = re.sub(r"[^a-zA-Z-']+", '', token)
        contact[label] = cleaned if leave_case else cleaned.lower()
    return contact
Esempio n. 7
0
    def test_performance(self):
        """Yield one label-equality check per example in the labelled data.

        Loads ``tests/test_data_labeled.xml`` through ``readTrainingData``
        and, for each example, yields ``(equals, raw_string, labels_pred,
        labels_true)`` where ``labels_pred`` are the labels returned by
        ``parse(raw_string)``.
        """
        examples = list(
            readTrainingData(['tests/test_data_labeled.xml'], GROUP_LABEL))

        for raw_string, components in examples:
            # zip(*pairs) transposes (token, label) pairs into two tuples.
            _tokens_true, labels_true = zip(*components)
            _tokens_pred, labels_pred = zip(*parse(raw_string))
            yield equals, raw_string, labels_pred, labels_true
    def test_performance(self):
        """Generate a parser-vs-truth comparison for each labelled name.

        Every entry read from ``tests/test_data_labeled.xml`` produces a
        yielded tuple ``(equals, raw_string, labels_pred, labels_true)``;
        the predicted labels come from ``parse`` and the true labels from
        the labelled components.
        """
        source_files = ["tests/test_data_labeled.xml"]
        labeled_names = list(readTrainingData(source_files, GROUP_LABEL))

        for labeled_name in labeled_names:
            raw_string, components = labeled_name
            expected = tuple(zip(*components))
            _, labels_true = expected
            predicted = tuple(zip(*parse(raw_string)))
            _, labels_pred = predicted
            yield equals, raw_string, labels_pred, labels_true
Esempio n. 9
0
def getIncorrect(name_list, correct_tag):
    """Return the names whose first parsed token is not ``correct_tag``.

    Each name is parsed with ``probablepeople.parse`` and only the first
    (token, label) pair is inspected.  A summary line with the number and
    percentage of incorrect names is printed.

    Args:
        name_list: sized collection of name strings.
        correct_tag: the label every name's first token should receive.

    Returns:
        A list with one ``[(token, correct_tag)]`` entry per incorrectly
        labelled name, i.e. the first token paired with the expected label.
    """
    incorrect_list = []
    for name in name_list:
        labeled_sequence = probablepeople.parse(name)
        string, label = labeled_sequence[0]
        if label != correct_tag:
            incorrect_list.append([(string, correct_tag)])

    total = len(name_list)
    # An empty input would otherwise divide by zero; report 0 % instead.
    percent = int(len(incorrect_list) / total * 100) if total else 0
    print(len(incorrect_list), "/", total, " incorrect, ", percent, "%")
    return incorrect_list
Esempio n. 10
0
def getIncorrect(name_list, correct_tag):
    """Collect names whose leading token was not labelled ``correct_tag``.

    Every name in ``name_list`` is parsed with ``probablepeople.parse``; the
    label of the first parsed token is compared with ``correct_tag``.  A
    one-line summary (count, total, integer percentage) is printed.

    Args:
        name_list: sized collection of name strings.
        correct_tag: expected label for the first token of each name.

    Returns:
        List of ``[(token, correct_tag)]`` entries, one per mislabelled name.
    """
    incorrect_list = []
    for name in name_list:
        labeled_sequence = probablepeople.parse(name)
        string, label = labeled_sequence[0]
        if label != correct_tag:
            incorrect_list.append([(string, correct_tag)])

    n_total = len(name_list)
    n_wrong = len(incorrect_list)
    if n_total:
        percent_wrong = int(float(n_wrong) / float(n_total) * 100)
    else:
        # Nothing to check: avoid ZeroDivisionError and report 0 %.
        percent_wrong = 0
    print(
        n_wrong,
        "/",
        n_total,
        " incorrect, ",
        percent_wrong,
        "%",
    )
    return incorrect_list
Esempio n. 11
0
def get_names(name_str: str) -> Tuple[List[Name], Optional[str]]:
    """Turn a raw name string into ``Name`` objects plus its category.

    Words found in ``remove_list`` (case-insensitively) are dropped and the
    rest are title-cased before tagging.

    Returns:
        ``([], None)`` for an empty input.  Otherwise a list of names and the
        category reported by ``tag``: a single entity ``Name`` for ``CORP``,
        the names extracted from the tagged result for ``PERSON``, and the
        names extracted from ``parse`` for any other category.  When tagging
        raises ``RepeatedLabelError`` the names are built from the error's
        ``parsed_string`` and the category is ``None``.
    """
    if not name_str:
        return [], None

    kept_words = (
        word.title() for word in name_str.split()
        if word.lower() not in remove_list
    )
    name_str = " ".join(kept_words)

    try:
        tagged, category = tag(name_str)
        if category == CORP:
            return [Name(entity_name=name_str)], category
        if category == PERSON:
            return tagged_name_retrieve(tagged), PERSON
        # Household (ie: multiple names) should be the only other option
        return parsed_name_retrieve(parse(name_str)), category
    except RepeatedLabelError as err:
        # parsed_string is actually the same as parse(<name>)
        return parsed_name_retrieve(err.parsed_string), None
def ID_RemoveNames(df):
    """Drop rows of ``df`` whose ``search_term`` looks like a person's name.

    Steps, as implemented below:

    1. Parse every ``df.search_term`` value with ``pp.parse`` and load the
       results into a DataFrame (one row per search term).
    2. Drop the columns at positions 5 through 23, name the first five
       columns ``name0``..``name4``, stringify every cell and keep the rows
       whose ``name0`` text contains "GivenName".
    3. Split the stringified ``name0`` / ``name1`` cells at the first comma
       into a text part and a label part, then strip parentheses and
       apostrophes from the ends of each value.
    4. Run the spaCy ``en_core_web_lg`` pipeline over the first/last name
       text and record ``(pos_, ent_type_, head.pos_, tag_)`` for every
       token of every detected entity.
    5. Keep rows where either entity string contains
       "PROPN PERSON PROPN NNP", join first and last name into one string,
       pass the resulting list through ``remove_punct`` and filter those
       names out of ``df``.

    Args:
        df: DataFrame with a ``search_term`` column.  NOTE(review): rows are
            read with ``df.search_term[i]`` for ``i`` in ``range(len(df))``,
            which presumably assumes a default 0..n-1 index -- confirm with
            callers.

    Returns:
        ``df`` restricted to the rows whose ``search_term`` is not in the
        detected name list.
    """
    # Parse each search term; every entry of named_list is one parse result.
    named_list = []
    for i in range(len(df)):
        named_list.append(pp.parse(df.search_term[i]))

    # Bare expression: has no effect when run as a script (likely a leftover
    # from interactive/notebook use).
    named_list
    namesdf = pd.DataFrame(named_list)
    # Discard the columns at positions 5..23 and label the first five.
    namesdf = namesdf.drop(namesdf.iloc[:, 5:24], axis=1)
    namesdf = namesdf.rename(columns={
        0: "name0",
        1: "name1",
        2: "name2",
        3: "name3",
        4: "name4"
    })

    # Missing cells become the literal text "no value"; everything is then
    # converted to str so the .str accessors below can be used.
    namesdf.fillna("no value", inplace=True)
    namesdf = namesdf.applymap(str)

    # Keep only rows whose first parsed part mentions the GivenName label.
    gvnnames = namesdf[(namesdf.name0.str.contains("GivenName"))]
    gvnnames = gvnnames.drop_duplicates()
    gvnnames = gvnnames[["name0", "name1"]]
    gvnnames.reset_index(drop=True, inplace=True)

    # new data frame with split value columns
    # (each stringified cell is split once, at its first comma)
    new = gvnnames["name0"].str.split(",", n=1, expand=True)
    new2 = gvnnames["name1"].str.split(",", n=1, expand=True)

    # Text before the first comma of name0 / name1.
    gvnnames["firstname"] = new[0]
    gvnnames["lastname"] = new2[0]

    # Text after the first comma of name0 / name1.
    gvnnames["name_type1"] = new[1]
    gvnnames["name_type2"] = new2[1]

    # # Dropping old Name columns
    # data.drop(columns =["Name"], inplace = True)

    # NOTE(review): gvnnames1 is a column selection of gvnnames and is
    # assigned to below; pandas may emit SettingWithCopyWarning here --
    # confirm this is acceptable.
    gvnnames1 = gvnnames[["firstname", "name_type1", "lastname", "name_type2"]]

    # Strip leading/trailing "(", ")" and "'" characters from every value.
    gvnnames1[gvnnames1.columns] = gvnnames1.apply(lambda x: x.str.strip("("))
    gvnnames1[gvnnames1.columns] = gvnnames1.apply(lambda x: x.str.strip(")"))
    gvnnames1[gvnnames1.columns] = gvnnames1.apply(lambda x: x.str.strip("''"))

    # Replace the name strings with spaCy documents so entities can be read.
    nlp = spacy.load("en_core_web_lg")
    for col_name in ["firstname", "lastname"]:
        gvnnames1[col_name] = gvnnames1[col_name].apply(nlp)

    # Per row: token attributes for every token inside every entity of the
    # first-name document.
    entities1 = []
    for i in range(len(gvnnames1)):
        entities1.append([(X.pos_, X.ent_type_, X.head.pos_, X.tag_)
                          for Y in gvnnames1["firstname"][i].ents for X in Y])

    # Same for the last-name document.
    entities2 = []
    for i in range(len(gvnnames1)):
        entities2.append([(X.pos_, X.ent_type_, X.head.pos_, X.tag_)
                          for Y in gvnnames1["lastname"][i].ents for X in Y])

    # Store the attribute lists as plain text with all punctuation removed,
    # so they can be searched with str.contains below.
    gvnnames1["entities1"] = entities1
    gvnnames1["entities1"] = [
        (str(i).translate(str.maketrans("", "", string.punctuation)))
        for i in gvnnames1["entities1"]
    ]
    gvnnames1["entities2"] = entities2
    gvnnames1["entities2"] = [
        (str(i).translate(str.maketrans("", "", string.punctuation)))
        for i in gvnnames1["entities2"]
    ]

    # Keep rows where either part contains the attribute sequence
    # "PROPN PERSON PROPN NNP".
    gvnnames1 = gvnnames1[(
        gvnnames1.entities1.str.contains("PROPN PERSON PROPN NNP", regex=True))
                          | (gvnnames1.entities2.str.
                             contains("PROPN PERSON PROPN NNP", regex=True))]
    gvnnames1 = gvnnames1.applymap(str)

    # Build "firstname lastname" strings and drop the helper columns.
    names = gvnnames1.assign(name=gvnnames1[["firstname", "lastname"]].apply(
        " ".join, axis=1)).drop(
            [
                "firstname", "lastname", "name_type1", "name_type2",
                "entities1", "entities2"
            ],
            axis=1,
        )
    names = names.drop_duplicates()
    names = names.name.tolist()
    names = remove_punct(names)
    # Remove every search term that exactly matches a detected name.
    df = df[(~df.search_term.isin(names))]
    print("id names removed")
    return df
        'GivenName', 'MiddleInitial', 'And', 'GivenName', 'MiddleInitial',
        'Surname'
    ]),
    ('a. smith', ['FirstInitial', 'Surname']),
    ('a smith', ['FirstInitial', 'Surname']),
    ('a smith d.o.', ['FirstInitial', 'Surname', 'SuffixOther']),
    ('a smith iii', ['FirstInitial', 'Surname', 'SuffixGenerational']),
    ('a smith DO', ['FirstInitial', 'Surname', 'SuffixOther']),
    ('belcher, bob b', ['Surname', 'GivenName', 'MiddleInitial']),
    ('belcher, bob b jr',
     ['Surname', 'GivenName', 'MiddleInitial', 'SuffixGenerational']),
    ('Belcher, Bob B. IV',
     ['Surname', 'GivenName', 'MiddleInitial', 'SuffixGenerational']),
]

# Parse every test string and compare the predicted label sequence with the
# expected one, printing a per-string verdict and a final failure count.
failed = 0
for string_tuple in test_strings:
    parsed = parse(string_tuple[0])
    labels_pred = [pair[1] for pair in parsed]
    labels_true = string_tuple[1]
    if labels_pred != labels_true:
        failed += 1
        print("*" * 40)
        print(string_tuple[0], "...INCORRECT PARSING")
        print("pred: ", labels_pred)
        print("true: ", labels_true)
        print("-" * 40)
        continue
    print(string_tuple[0], "...ok")

print("Failed", failed, "out of", len(test_strings), "strings")
Esempio n. 14
0
def scrape_dupage():
    """Download DuPage results, build race objects and dump them to JSON.

    Results and precinct data are fetched as JSON from the URLs returned by
    ``get_results_url`` and ``get_precincts_url``.  Races in the
    "Propositions" category get one candidate entry per option (the option
    text goes in ``last_name``); races in the "County" or "Judicial"
    categories get one entry per candidate with the name split by
    ``parse_name``.  Other categories are ignored.

    Returns:
        The list of race objects, which is also written to
        ``scrapers/dupage_data.json``.
    """
    COUNTY_NAME = "DuPage"

    # Resolve both endpoints before fetching anything.
    DUPAGE_RACE_URL = get_results_url()
    PRECINCTS_URL = get_precincts_url()

    data = requests.get(DUPAGE_RACE_URL).json()
    precincts_data = requests.get(PRECINCTS_URL).json()

    precinct_settings = precincts_data['settings']
    precincts_reporting = precinct_settings['numberofprecinctsreporting']
    precincts_total = precinct_settings['totalprecinctsreporting']

    dupage_county_results = []

    for datum in data:
        category = datum['CAT']

        if category == "Propositions":
            options = datum['CH']
            votes = datum['V']

            race_obj = initialize_race_obj(datum['C'], precincts_reporting,
                                           precincts_total, COUNTY_NAME)

            for ballot_order, (option, vote) in enumerate(
                    zip(options, votes), start=1):
                if option == "Yes/Sí":  # specific to DuPage
                    option = "Yes"

                race_obj["reporting_units"][0]['candidates'].append({
                    "first_name": "",
                    "middle_name": "",
                    "last_name": option.title(),
                    "vote_count": int(vote),
                    "ballot_order": int(ballot_order)
                })

            dupage_county_results.append(race_obj)

        elif category in ("County", "Judicial"):
            candidates = datum['CH']
            cand_votes = datum['V']
            cand_parties = datum['P']

            race_obj = initialize_race_obj(datum['C'], precincts_reporting,
                                           precincts_total, COUNTY_NAME)

            rows = zip(candidates, cand_votes, cand_parties)
            for ballot_order, (candidate, cand_vote, cand_party) in enumerate(
                    rows, start=1):
                if candidate == "Yes/Sí":  # specific to DuPage
                    candidate = "Yes"

                # probablepeople turns the name into a list of labelled parts
                full_name = pp.parse(candidate, 'person')
                first_name, middle_name, last_name = parse_name(full_name)

                race_obj["reporting_units"][0]['candidates'].append({
                    "first_name": first_name,
                    "middle_name": middle_name,
                    "last_name": last_name,
                    "vote_count": int(cand_vote),
                    "party": cand_party,
                    "ballot_order": int(ballot_order)
                })

            dupage_county_results.append(race_obj)

    with open('scrapers/dupage_data.json', 'w', encoding='utf-8') as f:
        json.dump(dupage_county_results, f, ensure_ascii=False, indent=4)

    return dupage_county_results


# this should be commented out when running the app
# leave it in if you're just testing the scraper
# scrape_dupage()
Esempio n. 15
0
def get_possible_email(contact_name, email):
    """Guess which naming pattern produced ``email`` for ``contact_name``.

    The contact name is parsed as a person; the last ``GivenName`` and
    ``Surname`` tokens (lower-cased) are combined in a fixed list of
    patterns (full names and first letters, joined by nothing, ``.``, ``-``
    or ``_``) with the e-mail's domain.  The first pattern whose address
    equals the lower-cased ``email`` wins.

    Args:
        contact_name: raw contact name; a ``TypeError`` from the parser is
            logged and treated as "no name parts".
        email: the e-mail address to explain.

    Returns:
        ``{'pattern': <pattern key>, 'email': <address>}`` for an exact
        pattern match; ``{'pattern': '_surname_given_name_', 'email': email}``
        when both names merely occur in the part before ``@``; otherwise
        ``None``.
    """
    split_name_parts = []
    try:
        split_name_parts = pp.parse(contact_name, type='person')
    except TypeError as e:
        # Best effort: keep going with no name parts.
        logger.error("possible_email: " + str(e) + ' - ' +
                     json.dumps(contact_name))

    given_name = ''
    surname = ''
    for token, label in split_name_parts:
        if label == 'Surname':
            surname = token.lower()
        if label == 'GivenName':
            given_name = token.lower()

    email = email.lower()
    domain = email_split(email).domain
    # First letters; slicing yields '' for an empty name.
    surname0 = surname[:1]
    given_name0 = given_name[:1]

    # (pattern key, local part) in priority order: the first match is
    # returned, so the order must not change.
    candidates = (
        ('given_name|surname', given_name + surname),
        ('given_name|.|surname', given_name + '.' + surname),
        ('given_name|-|surname', given_name + '-' + surname),
        ('given_name|_|surname', given_name + '_' + surname),
        # Fixed: this entry used to be built as given_name + surname, which
        # duplicated 'given_name|surname' and therefore could never match.
        ('surname|given_name', surname + given_name),
        ('surname|.|given_name', surname + '.' + given_name),
        ('surname|-|given_name', surname + '-' + given_name),
        ('surname|_|given_name', surname + '_' + given_name),
        ('surname0|.|given_name', surname0 + '.' + given_name),
        ('surname0|given_name', surname0 + given_name),
        ('surname0|-|given_name', surname0 + '-' + given_name),
        ('surname0|_|given_name', surname0 + '_' + given_name),
        ('given_name0|.|surname', given_name0 + '.' + surname),
        ('given_name0|surname', given_name0 + surname),
        ('given_name0|-|surname', given_name0 + '-' + surname),
        ('given_name0|_|surname', given_name0 + '_' + surname),
        ('given_name|.|surname0', given_name + '.' + surname0),
        ('given_name|surname0', given_name + surname0),
        ('given_name|-|surname0', given_name + '-' + surname0),
        ('given_name|_|surname0', given_name + '_' + surname0),
        ('surname|.|given_name0', surname + '.' + given_name0),
        # NOTE(review): the leading '|' in this key looks like a typo, but
        # it is kept because callers may already store this pattern value.
        ('|surname|given_name0', surname + given_name0),
        ('surname|-|given_name0', surname + '-' + given_name0),
        ('surname|_|given_name0', surname + '_' + given_name0),
        ('surname0|given_name0', surname0 + given_name0),
        ('given_name0|surname0', given_name0 + surname0),
        ('surname', surname),
        ('given_name', given_name),
    )

    for pattern, local_part in candidates:
        possible_email = "%s@%s" % (local_part, domain)
        if possible_email == email:
            return {'pattern': pattern, 'email': possible_email}

    # Fallback: both names appear somewhere in the part before '@'.
    local_part = email.split('@')[0]
    if surname and given_name and surname in local_part and given_name in local_part:
        return {'pattern': '_surname_given_name_', 'email': email}

    return None
def scrape_cook():
    """Build Cook County race objects from the results file and ID sheet.

    After ``get_txtfile()`` has run, every line of
    ``scrapers/updated_cook.txt`` is matched against the rows of
    ``scrapers/cook-IDs.csv`` by the first seven characters of the record
    ID.  A race object is created the first time a race name is seen, and
    each matching row adds one candidate entry to the race object(s) whose
    ``name`` equals the race name.

    Fixed-width fields read from a results line: characters 7-11 total
    precincts, 11-18 vote count, 18-22 precincts reporting.  The ballot
    order is characters 4-7 of the CSV record ID.

    Returns:
        The list of race objects, which is also written to
        ``scrapers/cook_data.json``.
    """
    get_txtfile()

    COUNTY_NAME = "Cook County"
    cook_county_results = []
    added = set()  # race names that already have a race object

    with open('scrapers/cook-IDs.csv', newline='') as f:
        # csv.reader yields [] for blank lines; skip them so indexing the
        # row below cannot raise IndexError.
        cook_info = [row for row in csv.reader(f) if row]
    with open('scrapers/updated_cook.txt',
              'r') as r:  # should be name of newly-written file
        results_data = r.readlines()

    # This matches results races to dict races by the first seven characters
    # of the record.
    for results_row in results_data:
        current_ID_match = results_row[0:7]  # RESULTS
        for info_line in cook_info:
            full_ID = info_line[0]
            if current_ID_match != full_ID[0:7]:  # CONTEXT
                continue

            race_name = info_line[1].title()
            candidate = info_line[2]
            # uses probablepeople to parse names into a list
            full_name = pp.parse(candidate, 'person')
            first_name, middle_name, last_name = parse_name(full_name)

            precincts_total = int(results_row[7:11])
            vote_count = int(results_row[11:18])
            precincts_reporting = int(results_row[18:22])
            # The party code sits at full_ID[22:25]; it is not part of the
            # candidate entry emitted by this scraper.
            ballot_order = int(full_ID[4:7])

            if race_name not in added:
                # creates object in format of race object for use in
                # TribPub's Google Sheet
                race_obj = initialize_race_obj(race_name,
                                               precincts_reporting,
                                               precincts_total,
                                               COUNTY_NAME)
                cook_county_results.append(race_obj)
                added.add(race_name)

            for item in cook_county_results:
                if item['name'] == race_name.title():
                    item['reporting_units'][0]['candidates'].append({
                        "first_name": first_name,
                        "middle_name": middle_name,
                        "last_name": last_name,
                        "vote_count": vote_count,
                        "ballot_order": ballot_order
                    })

    with open('scrapers/cook_data.json', 'w', encoding='utf-8') as f:
        json.dump(cook_county_results, f, ensure_ascii=False, indent=4)

    return cook_county_results


# this should be commented out when running the app
# leave it in if you're just testing the scraper
# scrape_cook()
Esempio n. 17
0
    def handle(self, *args, **options):
        """Fill in first/middle/last names for contacts that lack them.

        Contacts with a non-empty ``name`` but a missing first or last name
        are selected with a raw SQL query (grouped by name).  Each name is
        cleaned, run through the spaCy model loaded from
        ``settings.SPACY_CUSTOMN_MODEL_FOLDER`` and the first ``PERSON``
        entity is split with probablepeople.  When that yields no given
        name, the whole cleaned name is split instead.  If both a first and
        a last name were found, every ``DirectoryContact`` with the same
        ``name`` is updated.
        """
        self.spacy_model = spacy.load(settings.SPACY_CUSTOMN_MODEL_FOLDER)

        contacts = DirectoryContact.objects.raw(
            "select id, name, count(1) from directory_contact  "
            "where (first_name is null or last_name is null) and name not in ('', 'null') and name is not null  "
            "group by name  "
            "order by count(1) desc;"
        )
        progress_bar = tqdm(desc="Processing", total=len(contacts))

        field_for_label = {
            'GivenName': 'first_name',
            'Surname': 'last_name',
            'MiddleName': 'middle_name',
        }

        def collect_parts(text, into):
            # Store the cleaned, lower-cased token of each recognised label;
            # a later token with the same label overwrites an earlier one.
            for token, label in pp.parse(text, 'person'):
                cleaned = re.sub(r"[^a-zA-Z-']+", '', token).lower()
                field = field_for_label.get(label)
                if field is not None:
                    into[field] = cleaned

        for contact in contacts:
            raw_name = contact.name or ''
            # Drop 1-3 letter abbreviations ending in a period, then collapse
            # everything that is not a letter or apostrophe into one space.
            raw_name = re.sub(r'[a-zA-Z]{1,3}\.', '', raw_name)
            raw_name = re.sub(r"[^a-zA-Z']+", ' ', raw_name)
            name_doc = self.spacy_model(raw_name)

            parts = {'first_name': None, 'last_name': None, 'middle_name': None}
            # Organization/title extraction is currently disabled, so both
            # values stay None.
            organization = None
            title = None

            # Only the first PERSON entity is considered.
            person_ent = next(
                (ent for ent in name_doc.ents if ent.label_ == 'PERSON'), None)
            if person_ent is not None:
                collect_parts(person_ent.text, parts)

            # Fall back to the whole cleaned name when no given name emerged.
            if not parts['first_name']:
                collect_parts(raw_name, parts)

            first_name = parts['first_name']
            last_name = parts['last_name']
            middle_name = parts['middle_name']

            if (first_name and last_name) or organization or title:
                print("raw_name=%s, first_name=%s, last_name=%s, title=%s, org=%s" %
                      (raw_name, first_name, last_name, title, organization))
                same_name = DirectoryContact.objects.filter(name=contact.name)
                same_name.update(
                    first_name=first_name,
                    last_name=last_name,
                    middle_name=middle_name,
                    title=title,
                )

            progress_bar.update(1)

        print('done batch')
        progress_bar.close()
def scrape_cook():
    """Build Cook County race objects from the results file and ID sheet.

    After ``get_txtfile()`` has run, every line of
    ``scrapers/cook_files/updated_cook.txt`` is matched against the rows of
    ``scrapers/cook_files/cook-IDs.csv`` by the first seven characters of
    the record ID.  A race object is created the first time a race name is
    seen (its precinct counts and ``amendment`` flag come from that first
    matching row), and each matching row adds one candidate entry to the
    race object(s) whose ``name`` equals the race name.

    Fixed-width fields read from a results line: characters 7-11 total
    precincts, 11-18 vote count, 18-22 precincts reporting.  From the CSV
    record ID: characters 4-7 ballot order, 22-25 party code.  A party code
    other than DEM, REP or NON marks the row as an amendment with an empty
    party.

    Returns:
        The list of race objects, which is also written to
        ``scrapers/cook_files/cook_data.json``.
    """
    get_txtfile()

    cook_county_results = []
    added = set()  # race names that already have a race object

    with open('scrapers/cook_files/cook-IDs.csv', newline='') as f:
        # csv.reader yields [] for blank lines; skip them so indexing the
        # row below cannot raise IndexError.
        cook_info = [row for row in csv.reader(f) if row]
    with open('scrapers/cook_files/updated_cook.txt',
              'r') as r:  # should be name of newly-written file
        results_data = r.readlines()

    # This matches results races to dict races by the first seven characters
    # of the record.
    for results_row in results_data:
        current_ID_match = results_row[0:7]  # RESULTS
        for info_line in cook_info:
            full_ID = info_line[0]
            if current_ID_match != full_ID[0:7]:  # CONTEXT
                continue

            race_name = info_line[1].title()
            candidate = info_line[2]
            # uses probablepeople to parse names into a list
            full_name = pp.parse(candidate, 'person')
            first_name, middle_name, last_name = parse_name(full_name)

            total_precincts = int(results_row[7:11])
            vote_count = int(results_row[11:18])
            precincts_reporting = int(results_row[18:22])

            party_code = full_ID[22:25]
            # Fixed: the flag used to be the strings "True"/"False" passed
            # through bool(), which is True for any non-empty string.
            amendment = party_code not in ("DEM", "REP", "NON")
            cand_party = "" if amendment else party_code
            ballot_order = int(full_ID[4:7])

            if race_name not in added:
                # creates object in format of race object for use in
                # TribPub's Google Sheet
                race_obj = {
                    "name": race_name.title(),
                    "description": "",
                    "election_date": "2020-11-03",
                    "market": "chinews",
                    "uncontested": False,
                    "amendment": amendment,
                    "state_postal": "IL",
                    "recount": False,
                    "reporting_units": [{
                        "name": "Cook",
                        "level": "county",
                        "district_type": "",
                        "state_postal": "IL",
                        "geo_id": "",
                        "electoral_vote_total": 0,
                        "precincts_reporting": precincts_reporting,
                        "total_precincts": total_precincts,
                        "data_source_update_time":
                        datetime.now(
                            timezone.utc).strftime('%Y-%m-%dT%H:%M:%S%z'),
                        "candidates": []  # filled in below
                    }]
                }

                cook_county_results.append(race_obj)
                added.add(race_name)

            for item in cook_county_results:
                if item['name'] == race_name.title():
                    item['reporting_units'][0]['candidates'].append({
                        "first_name": first_name,
                        "middle_name": middle_name,
                        "last_name": last_name,
                        "vote_count": vote_count,
                        "party": cand_party,
                        "ballot_order": ballot_order
                    })

    with open('scrapers/cook_files/cook_data.json', 'w',
              encoding='utf-8') as f:
        json.dump(cook_county_results, f, ensure_ascii=False, indent=4)

    return cook_county_results


# this should be commented out when running the app
# leave it in if you're just testing the scraper
# scrape_cook()

## This also works if we have issues with the dict for some reason and want to only use SummaryExport.txt.

# with open('SummaryExport.txt','r') as f:
# 	data = f.readlines()[6:]
# 	for row in data:
# 		# splits row by an instance of two or more characters of white space
# 		# https://stackoverflow.com/questions/48917121/split-on-more-than-one-space
# 		row = re.split(r"\s{2,}", row)
# 		if "&" in row[2]:
# 			pass
# 		else:
# 			row_race_name = row[1]
# 			row_geography = row[3]
# 			cand_id = row[0]
# 			total_precincts = int(cand_id[7:11])
# 			vote_count = int(cand_id[11:18])
# 			precincts_reporting = int(cand_id[18:22])
# 			cand_party = cand_id[22:]
# 			if cand_party == "":
# 				amendment = True
# 			else:
# 				amendment = False
# 			ballot_order = int(cand_id[5:7])

# 			# handles splitting names up
# 			row_cand_name = row[2]
# 			print(row_cand_name)
# 			full_name = pp.parse(row_cand_name, 'person') # uses probablepeople to parse names into a list
# 			if len(full_name) == 1:
# 				first_name = ""
# 				middle_name = ""
# 				last_name = full_name[0][0]
# 			elif len(full_name) == 2:
# 				first_name = full_name[0][0]
# 				middle_name = ""
# 				last_name = full_name[1][0]
# 			elif len(full_name) == 3:
# 				first_name = full_name[0][0]
# 				middle_name = full_name[1][0]
# 				last_name = full_name[2][0]
# 			elif len(full_name) == 4:
# 				first_name = full_name[0][0]
# 				middle_name = full_name[1][0]
# 				last_name = full_name[2][0]+" "+full_name[3][0]

# 			if row_race_name not in added:

# 				# creates object in format of race object for use in TribPub's Google Sheet
# 				race_obj = {
# 					"name": row_race_name,
# 					"description": "",
# 					"election_date": "2020-11-03",
# 					"market": "chitrib",
# 					"uncontested": False,
# 					"amendment": amendment, # default, can be changed later
# 					"state_postal": "IL",
# 					"recount": False,
# 					"reporting_units": [
# 					    {
# 					        "name": "Cook County",
# 					        "level": "county",
# 					        "district_type": "",
# 					        "state_postal": "IL",
# 					        "geo_id": "",
# 					        "electoral_vote_total": 0,
# 					        "precincts_reporting": precincts_reporting,
# 					        "total_precincts": total_precincts,
# 					        "data_source_update_time": datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S%z'),
# 					        "candidates": [] # creates empty list for candidates info
# 					    }
# 					]
# 				}

# 				cook_county_results.append(race_obj)
# 				added.append(row_race_name)
# 			else:
# 				pass

# 			for item in cook_county_results:
# 				if item['name'] == row_race_name:
# 					item['reporting_units'][0]['candidates'].append({
# 						"first_name": first_name,
# 						"middle_name": middle_name,
# 						"last_name": last_name,
# 						"vote_count": vote_count,
# 						"party": cand_party,
# 						"ballot_order": ballot_order
# 					})
# 	# print(cook_county_results)
# 	with open('cook_data.json', 'w', encoding='utf-8') as f:
# 		json.dump(cook_county_results, f, ensure_ascii=False, indent=4)
# 	return cook_county_results
Esempio n. 19
0
def scrape_chicago():
    """Fetch the Chicago summary results and assemble race objects.

    The results text is downloaded and saved locally, then every results
    line is compared against every row of the ``chicago-IDs.csv`` context
    sheet; rows whose ID shares the line's first seven characters are
    treated as a match.  The first time a race name is seen a race object
    is created with ``initialize_race_obj``; the matched candidate is then
    appended to every collected race whose ``name`` equals the race name.
    The collected races are written to ``chicago_data.json`` and returned.

    Returns:
        list: the race objects built from the matched rows.
    """
    results_path = 'scrapers/chicago_files/chicago-results.txt'

    # TODO: REPLACE THIS WITH THE REAL URL ON ELECTION NIGHT
    response = requests.get('https://chicagoelections.gov/ap/summary.txt')
    with open(results_path, 'wb') as download:
        download.write(response.content)

    chicago_results = []
    seen_races = []  # race names that already have a race object

    # CONTEXT SHEET - this info not provided during election night
    with open('scrapers/chicago_files/chicago-IDs.csv',
              newline='') as context_file:
        context_rows = list(csv.reader(context_file))

    # created from chicago-zeroes-full.txt
    with open(results_path, 'r') as results_file:
        result_lines = results_file.readlines()

    # Results lines and context rows are paired on the first seven
    # characters of their record / ID.
    for line in result_lines:
        line_key = line[0:7]

        for context_row in context_rows:
            if context_row[0][0:7] != line_key:
                continue

            context_id = context_row[0]
            race_name = context_row[1].title()

            # probablepeople parses the candidate string into labelled tokens
            name_tokens = pp.parse(context_row[2], 'person')
            first_name, middle_name, last_name = parse_name(name_tokens)

            # fixed-width slices of the results line
            total_precincts = int(line[7:11])
            vote_count = int(line[11:18])
            precincts_reporting = int(line[18:22])

            party_code = context_id[22:25]
            if party_code in ("DEM", "REP", "NON"):
                cand_party = party_code
            else:
                cand_party = ""

            ballot_order = int(context_id[4:7])

            if race_name not in seen_races:
                chicago_results.append(
                    initialize_race_obj(race_name, precincts_reporting,
                                        total_precincts))
                seen_races.append(race_name)

            wanted_name = race_name.title()
            for race in chicago_results:
                if race['name'] != wanted_name:
                    continue
                race['reporting_units'][0]['candidates'].append({
                    "first_name": first_name,
                    "middle_name": middle_name,
                    "last_name": last_name,
                    "vote_count": vote_count,
                    "party": cand_party,
                    "ballot_order": ballot_order,
                })

    with open('scrapers/chicago_files/chicago_data.json', 'w',
              encoding='utf-8') as json_file:
        json.dump(chicago_results, json_file, ensure_ascii=False, indent=4)

    return chicago_results


# this should be commented out when running the app
# leave it in if you're just testing the scraper
# scrape_chicago()
Esempio n. 20
0
def scrape_mchenry():
    """Fetch McHenry results and build one race object per county contest.

    Only records whose ``CAT`` is ``"County"`` are used.  Records with
    ``SUBCAT == "Questions"`` are handled as ballot questions: each option
    becomes a candidate entry with the title-cased option text as
    ``last_name`` and no ``party`` key.  Every other county record is a
    candidate race whose names are parsed with probablepeople.  The result
    is written to ``scrapers/mchenry_data.json`` and returned.

    Returns:
        list: race objects created by ``initialize_race_obj`` with their
        candidate lists filled in.
    """
    COUNTY_NAME = "McHenry"

    # URLs
    results_url = get_results_url()
    precincts_url = get_precincts_url()

    records = requests.get(results_url).json()
    precinct_settings = requests.get(precincts_url).json()['settings']

    # gets precinct info
    precincts_reporting = precinct_settings['numberofprecinctsreporting']
    precincts_total = precinct_settings['totalprecinctsreporting']

    mchenry_county_results = []

    for record in records:
        if record['CAT'] != 'County':
            continue

        if record['SUBCAT'] == 'Questions':
            choices = record['CH']
            tallies = record['V']

            race_obj = initialize_race_obj(record['C'], precincts_reporting,
                                           precincts_total, COUNTY_NAME)

            for position, (choice, tally) in enumerate(
                    zip(choices, tallies), start=1):
                race_obj["reporting_units"][0]['candidates'].append({
                    "first_name": "",
                    "middle_name": "",
                    "last_name": choice.title(),
                    "vote_count": int(tally),
                    "ballot_order": position,
                })
        else:
            names = record['CH']
            tallies = record['V']
            parties = record['P']

            race_obj = initialize_race_obj(record['C'], precincts_reporting,
                                           precincts_total, COUNTY_NAME)

            for position, (name, tally, party) in enumerate(
                    zip(names, tallies, parties), start=1):
                # uses probablepeople to parse names into a list
                name_tokens = pp.parse(name, 'person')
                first_name, middle_name, last_name = parse_name(name_tokens)

                race_obj["reporting_units"][0]['candidates'].append({
                    "first_name": first_name,
                    "middle_name": middle_name,
                    "last_name": last_name,
                    "vote_count": int(tally),
                    "party": party,
                    "ballot_order": position,
                })

        mchenry_county_results.append(race_obj)

    with open('scrapers/mchenry_data.json', 'w',
              encoding='utf-8') as json_file:
        json.dump(mchenry_county_results, json_file, ensure_ascii=False,
                  indent=4)

    return mchenry_county_results


# this should be commented out when running the app
# leave it in if you're just testing the scraper
# scrape_mchenry()
Esempio n. 21
0
def lookupPeople(name):
    """Return the result of ``probablepeople.parse`` for ``name``."""
    return probablepeople.parse(name)
def scrape_lake():
    """Fetch Lake County results and build one race object per record.

    Every record in the results feed becomes a race object (via
    ``initialize_race_obj``); candidate names are parsed with
    probablepeople.  The party values are zipped together with the
    candidates and votes but are not written into the candidate entries.
    The result is written to ``scrapers/lake_data.json`` and returned.

    Returns:
        list: race objects with their candidate lists filled in.
    """
    COUNTY_NAME = "Lake County"

    # sets URLs
    results_url = get_results_url()
    precincts_url = get_precincts_url()

    # gets data
    records = requests.get(results_url).json()
    precinct_settings = requests.get(precincts_url).json()['settings']

    # gets precinct info
    precincts_reporting = precinct_settings['numberofprecinctsreporting']
    precincts_total = precinct_settings['totalprecinctsreporting']

    # creates empty list for results info
    lake_county_results = []

    for record in records:
        contest = record['C']
        names = record['CH']
        tallies = record['V']
        parties = record['P']

        race_obj = initialize_race_obj(contest, precincts_reporting,
                                       precincts_total, COUNTY_NAME)

        # parties takes part in the zip, so it can shorten the iteration
        for position, (name, tally, _party) in enumerate(
                zip(names, tallies, parties), start=1):
            # uses probablepeople to parse names into a list
            name_tokens = pp.parse(name, 'person')
            first_name, middle_name, last_name = parse_name(name_tokens)

            # appends to candidates list
            race_obj["reporting_units"][0]['candidates'].append({
                "first_name": first_name,
                "middle_name": middle_name,
                "last_name": last_name,
                "vote_count": int(tally),
                "ballot_order": position,
            })

        lake_county_results.append(race_obj)

    with open('scrapers/lake_data.json', 'w', encoding='utf-8') as json_file:
        json.dump(lake_county_results, json_file, ensure_ascii=False,
                  indent=4)

    return lake_county_results


# this should be commented out when running the app
# leave it in if you're just testing the scraper
# scrape_lake()
Esempio n. 23
0
def _first_with_label(parsed_name, label):
    """Return the first token in ``parsed_name`` tagged ``label``, or ''."""
    return next((x[0] for x in parsed_name if x[1] == label), '')


def split_single_name_field(data_row, attachment):
    """
    split names into two
    fields: first and last, by comma
    or by ordering

    Only rows that have a ``last_name`` but no ``first_name`` are touched;
    any other row is returned unchanged.

    For such a row the function tries, in order:

    1. probablepeople (only when the module-level ``test_pp`` flag is set,
       the name contains no '/', and ``attachment.id`` is in neither
       ``first_name_first`` nor ``last_name_first``); the result is kept
       only if both a first and a last name came out non-blank.
    2. splitting on commas: the text before the comma(s) is the last name,
       the final piece is the first name.
    3. for names with more than one space- (or '/'-) separated piece,
       ``set_order_single_name``; if that leaves ``first_name`` empty the
       operator is prompted via ``raw_input``.

    ``data_row`` is modified in place and also returned.
    """
    if data_row['last_name'] and not data_row['first_name']:
        # TODO: this totally belongs elsewhere in transformations
        try:
            data_row['last_name'] = data_row['last_name'].decode('utf-8')
            data_row['first_name'] = data_row['first_name'].decode('utf-8')
        except (AttributeError, UnicodeError):
            # AttributeError: value has no .decode (e.g. None);
            # UnicodeError: bytes are not valid utf-8.  Anything else
            # (KeyboardInterrupt, SystemExit, ...) must propagate.
            print('failed to decode name to utf-8')
        # just testing out probablepeople
        if test_pp and '/' not in data_row[
                'last_name'] and attachment.id not in first_name_first and attachment.id not in last_name_first:
            parsed_name = probablepeople.parse(data_row['last_name'])
            last_name = _first_with_label(parsed_name, 'Surname')
            first_name = _first_with_label(parsed_name, 'GivenName')
            middle_init = _first_with_label(parsed_name, 'MiddleInitial')
            suffix = _first_with_label(parsed_name, 'SuffixGenerational')
            data_row['last_name'] = ' '.join([last_name, suffix])
            data_row['first_name'] = ' '.join([first_name, middle_init])
            # accept the probablepeople split only if both halves are non-blank
            if data_row['last_name'].strip() and data_row['first_name'].strip(
            ):
                return data_row

        comma_delimited = data_row['last_name'].split(',')
        if len(comma_delimited) == 2:
            data_row['last_name'] = comma_delimited[0]
            data_row['first_name'] = comma_delimited[1]
        elif len(comma_delimited) > 2:
            # last piece is the first name, everything before it the last name
            data_row['first_name'] = comma_delimited[-1].lstrip()
            data_row['last_name'] = ' '.join(comma_delimited[:-1])
        else:
            space_delimited = data_row['last_name'].split(' ')
            #hack
            if '/' in data_row['last_name']:
                space_delimited = data_row['last_name'].split('/')
            if len(space_delimited) > 1:
                data_row = set_order_single_name(data_row, attachment)
                if not data_row['first_name']:
                    # blocks until the operator acknowledges the problem row
                    msg = ' fuuuuuuuuuuuuck ... how did they split this up? better special case it ... \n'
                    msg += data_row['last_name'] + '\n'
                    msg += ' '.join(['attachment.id:', str(attachment.id)])
                    raw_input(msg)
    return data_row
def scrape_kane():
    """Build race objects for Kane County from the parsed results page.

    Every ``h2`` element of the document returned by ``get_results_url()``
    is treated as one race.  Precinct counts are read from the second
    ``td`` preceding the heading, candidate names and vote counts from the
    ``td`` cells of the next ``table`` after it.  The races are written to
    ``scrapers/kane_data.json`` and returned.

    Returns:
        list: race objects created by ``initialize_race_obj`` with their
        candidate lists filled in.
    """

    COUNTY_NAME = "Kane County"
    # presumably a BeautifulSoup document (findAll/findPrevious/findNext are
    # used on it below) -- TODO confirm against get_results_url
    kane_data = get_results_url()

    # creates empty list for results info
    kane_county_results = []

    race_data = kane_data.findAll(
        "h2"
    )  # h2 gets each race name, findPrevious/Next/Children is based on this
    for race in race_data:
        candidates = []
        votes = []

        # finds precincts reporting and total precincts
        # (two steps back: the second <td> before the heading)
        finding_precincts_info = race.findPrevious('td')
        precincts_info = finding_precincts_info.findPrevious('td')
        precincts = list(map(int, re.findall(r'\d+', str(
            precincts_info))))  # gets integers from precincts line, makes list
        precincts_reporting = precincts[0]
        precincts_total = precincts[1]
        # print(precincts_reporting, precincts_total)

        cands = race.findNext('table')
        names = cands.findChildren('td')
        for name in names:
            # work on the cell's HTML source text rather than the element
            name = str(name)
            if name.startswith('<td>'):
                # splits may be necessary to pinpoint just name
                # appends each name to candidates list
                if '(Write-In)' in name:
                    # NOTE(review): this rebinds `name` to a list, so the
                    # `'<b>' in name` test further down becomes a list
                    # membership test for write-in cells -- confirm intended
                    name = name.split('<b>', 2)
                    name_split = name[0]
                    cand_name_split = name_split.split('>', 2)
                    cand_name = cand_name_split[1]
                    candidates.append(cand_name)
                    # print('appended', cand_name)
                elif '(Independent)' in name or '(Democratic)' in name or '(Republican)' in name:
                    # drop the trailing "(Party)" part, then the opening tag
                    candidate_split = name.rsplit('(', 1)
                    candidate = candidate_split[0]
                    cand_name_split = candidate.split('>', 1)
                    cand_name = cand_name_split[1]
                    candidates.append(cand_name)
                    # print(cand_name)
                else:
                    # plain cell: text between the opening tag and the next "</"
                    name_split = name.split('>', 2)
                    name_split = str(name_split[1])
                    final_name = name_split.split('</', 2)
                    cand_name = final_name[0]
                    candidates.append(cand_name)
                    # print('appended', cand_name)
            if '<b>' in name:
                # text right after the first <b>, truncated at the first </b>
                name_split = name.split('</b>', 2)
                name_split = str(name_split[0])
                final_name = name_split.split('<b>', 2)
                if '%' not in final_name[1]:
                    # separates vote percentages from vote counts
                    # appends votes to votes list
                    cand_votes = final_name[1]
                    votes.append(cand_votes)
                    # print('appended', cand_votes)

        # race name: text after the first '>' in the part of the heading's
        # HTML that precedes the first '<br/>'
        race = str(race)
        race_split = race.split('<br/>', 2)
        race_split = race_split[0]
        final_race_name = race_split.split('>', 2)
        race_name = final_race_name[1]

        # creates object in format of race object for use in TribPub's Google Sheet
        race_obj = initialize_race_obj(race_name, precincts_reporting,
                                       precincts_total, COUNTY_NAME)

        # names and votes are paired by position; zip stops at the shorter list
        for option_index, (candidate,
                           vote) in enumerate(zip(candidates, votes)):
            full_name = pp.parse(
                candidate,
                'person')  # uses probablepeople to parse names into a list
            first_name, middle_name, last_name = parse_name(full_name)

            race_obj["reporting_units"][0]['candidates'].append({
                "first_name":
                first_name,
                "middle_name":
                middle_name,
                "last_name":
                last_name,
                "vote_count":
                int(vote),
                "ballot_order":
                int(option_index + 1)
            })

        kane_county_results.append(race_obj)

    with open('scrapers/kane_data.json', 'w', encoding='utf-8') as f:
        json.dump(kane_county_results, f, ensure_ascii=False, indent=4)

    return kane_county_results


# this should be commented out when running the app
# leave it in if you're just testing the scraper
# scrape_kane()
    ('Monique van de Ven', ['GivenName', 'Surname', 'Surname', 'Surname']),
    ('Robert J. Van de Graaff', ['GivenName', 'MiddleInitial', 'Surname', 'Surname', 'Surname']),
    ('a. b. smith', ['FirstInitial', 'MiddleInitial', 'Surname']),
    ('a. bob smith', ['FirstInitial', 'MiddleName', 'Surname']),
    ('bob a and linda b belcher', ['GivenName', 'MiddleInitial', 'And', 'GivenName', 'MiddleInitial', 'Surname']),
    ('a. smith', ['FirstInitial', 'Surname']),
    ('a smith', ['FirstInitial', 'Surname']),
    ('a smith d.o.', ['FirstInitial', 'Surname', 'SuffixOther']),
    ('a smith iii', ['FirstInitial', 'Surname', 'SuffixGenerational']),
    ('a smith DO', ['FirstInitial', 'Surname', 'SuffixOther']),
    ('belcher, bob b', ['Surname', 'GivenName', 'MiddleInitial']),
    ('belcher, bob b jr', ['Surname', 'GivenName', 'MiddleInitial', 'SuffixGenerational']),
    ('Belcher, Bob B. IV', ['Surname', 'GivenName', 'MiddleInitial', 'SuffixGenerational']),
]

# Run every labelled example through the parser, print a line per example
# and a final count of the examples whose predicted labels differ.
failed = 0
for example in test_strings:
    expected_labels = example[1]
    predicted_labels = [token[1] for token in parse(example[0])]

    if predicted_labels != expected_labels:
        failed += 1
        print("*" * 40)
        print(example[0], "...INCORRECT PARSING")
        print("pred: ", predicted_labels)
        print("true: ", expected_labels)
        print("-" * 40)
        continue

    print(example[0], "...ok")

print("Failed", failed, "out of", len(test_strings), "strings")
Esempio n. 26
0
def sppw(text, port=False, use_pp=True, host='localhost'):
    """
    Standalone function to wrap and combine Stanford NER and probablepeople.

    Form-feed, newline, carriage-return, tab and vertical-tab characters are
    removed from ``text``, the text is sent to the NER server at
    ``host``:``port`` and the inline-tagged reply is turned into a list of
    entities.  Entities whose tag contains "per" are additionally run
    through probablepeople's ``parse``.

    :param text: the text to tag.
    :param port: TCP port of the NER server.
    :param use_pp: when false, probablepeople is skipped and ``"pp"`` is None.
    :param host: host name of the NER server.
    :return: dict with the keys ``raw_response`` (the decoded reply),
        ``raw_ners`` (one ``{"string", "tag"}`` dict per element in the
        reply), ``ners`` (``raw_ners`` with elements whose tag starts with
        "i" merged into the preceding entity, and tags containing "-"
        reduced to their second dash-separated part) and ``pp`` (list of
        probablepeople parses, or None).

    >>> res = sppw("Willem-Alexander (Dutch: [ˈʋɪləm aːlɛkˈsɑndər]; Willem-Alexander Claus George Ferdinand; born 27 April 1967) is the King of the Netherlands.", 9991)
    >>> from pprint import pprint;pprint(res)
    {'ners': [{'string': 'Willem-Alexander', 'tag': 'person'},
              {'string': 'Willem-Alexander Claus George Ferdinand',
               'tag': 'person'},
              {'string': 'Netherlands', 'tag': 'location'}],
     'pp': [[('Willem-Alexander', 'GivenName')],
            [('Willem-Alexander', 'CorporationName'),
             ('Claus', 'CorporationName'),
             ('George', 'CorporationName'),
             ('Ferdinand', 'CorporationName')]],
     'raw_ners': [{'string': 'Willem-Alexander', 'tag': 'person'},
                  {'string': 'Willem-Alexander Claus George Ferdinand',
                   'tag': 'person'},
                  {'string': 'Netherlands', 'tag': 'location'}],
     'raw_response': u'<PERSON>Willem-Alexander</PERSON> (Dutch: [\u02c8\u028b\u026al\u0259m a\u02d0l\u025bk\u02c8s\u0251nd\u0259r]; <PERSON>Willem-Alexander Claus George Ferdinand</PERSON>; born 27 April 1967) is the King of the <LOCATION>Netherlands</LOCATION>.'}
    >>> res = sppw("Prof. Albert Einstein vertoeft op het oogenblik te Londen, en gisteravond was hij in Savoy Hotel eeregast aan een diner, gegeven door de Ort and Oze Societies. De voorzitter van de Engelsche sectie dier Vereeniging is Lord • Rothschild ; de voorzitter van de Duitsche sectie is prof. Einstein.  Lord Rothschild presideerde het diner; aan zijn rechterhand zat de beroemdste geleerde van onzen tyd, aan zijn linkerhand de beroemdste dichter, Bernard Shaw. Rechts van Einstein zat Wells.  Het was een gastmaal voor het intellect en z|jn dames.  Ik wil er geen verslag van geven, maar my bepalen tot enkele aanteekeningen.", 9993, True)
    >>> from pprint import pprint;pprint(res)
    {'ners': [{'string': 'Albert Einstein vertoeft', 'tag': 'pers'},
              {'string': 'Londen', 'tag': 'org'},
              {'string': 'gisteravond was hij in Savoy Hotel eeregast',
               'tag': 'org'},
              {'string': 'Ort and Oze Societies', 'tag': 'org'},
              {'string': 'Engelsche', 'tag': 'otros'},
              {'string': u'Vereeniging is Lord \u2022 Rothschild', 'tag': 'org'},
              {'string': 'Duitsche', 'tag': 'otros'},
              {'string': 'Einstein', 'tag': 'pers'},
              {'string': 'Rothschild', 'tag': 'org'},
              {'string': 'zat', 'tag': 'org'},
              {'string': 'Bernard Shaw', 'tag': 'pers'},
              {'string': 'Rechts van Einstein zat Wells', 'tag': 'pers'},
              {'string': 'Het', 'tag': 'pers'}],
     'pp': [[('Albert', 'GivenName'),
             ('Einstein', 'Surname'),
             ('vertoeft', 'Surname')],
            [('Einstein', 'Surname')],
            [('Bernard', 'GivenName'), ('Shaw', 'Surname')],
            [('Rechts', 'GivenName'),
             ('van', 'Surname'),
             ('Einstein', 'Surname'),
             ('zat', 'GivenName'),
             ('Wells', 'Surname')],
            [('Het', 'ShortForm')]],
     'raw_ners': [{'string': 'Albert Einstein vertoeft', 'tag': 'pers'},
                  {'string': 'Londen', 'tag': 'org'},
                  {'string': 'gisteravond was hij in Savoy Hotel eeregast',
                   'tag': 'org'},
                  {'string': 'Ort and Oze Societies', 'tag': 'org'},
                  {'string': 'Engelsche', 'tag': 'otros'},
                  {'string': u'Vereeniging is Lord \u2022 Rothschild',
                   'tag': 'org'},
                  {'string': 'Duitsche', 'tag': 'otros'},
                  {'string': 'Einstein', 'tag': 'pers'},
                  {'string': 'Rothschild', 'tag': 'org'},
                  {'string': 'zat', 'tag': 'org'},
                  {'string': 'Bernard Shaw', 'tag': 'pers'},
                  {'string': 'Rechts van Einstein zat Wells', 'tag': 'pers'},
                  {'string': 'Het', 'tag': 'pers'}],
     'raw_response': u'Prof. <PERS>Albert Einstein vertoeft</PERS> op het oogenblik te <ORG>Londen</ORG>, en <ORG>gisteravond was hij in Savoy Hotel eeregast</ORG> aan een diner, gegeven door de <ORG>Ort and Oze Societies</ORG>. De voorzitter van de <OTROS>Engelsche</OTROS> sectie dier <ORG>Vereeniging is Lord \u2022 Rothschild</ORG> ; de voorzitter van de <OTROS>Duitsche</OTROS> sectie is prof. <PERS>Einstein</PERS>.  Lord <ORG>Rothschild</ORG> presideerde het diner; aan zijn rechterhand <ORG>zat</ORG> de beroemdste geleerde van onzen tyd, aan zijn linkerhand de beroemdste dichter, <PERS>Bernard Shaw</PERS>. <PERS>Rechts van Einstein zat Wells</PERS>.  <PERS>Het</PERS> was een gastmaal voor het intellect en z|jn dames.  Ik wil er geen verslag van geven, maar my bepalen tot enkele aanteekeningen.'}
    """
    for whitespace in ("\f", "\n", "\r", "\t", "\v"):  # strip whitespaces
        text = text.replace(whitespace, '')
    text += "\n"  # ensure end-of-line

    with _tcpip4_socket(host, port) as s:
        if not isinstance(text, bytes):
            text = text.encode('utf-8')
        s.sendall(text)

        # NOTE(review): a single recv is assumed to return the whole reply;
        # confirm the server never splits its answer across packets.
        tagged_text = s.recv(10*len(text))

    result = tagged_text.decode("utf-8")

    ner = {"raw_response": result,
           "raw_ners": [],
           "ners": []}

    # Wrap the reply in a root element so it parses as a single tree.
    tree = lxml.html.fromstring("<xml>%s</xml>" % result)

    for item in tree.iter():
        if item.tag == 'xml':
            continue
        ner["raw_ners"].append({"string": item.text,
                                "tag": item.tag})

    ners = []
    for item in ner["raw_ners"]:
        tag = item["tag"]
        if tag[0] == 'i':
            # Continuation element: glue its text onto the previous entity
            # (dropped when there is no previous entity).
            if ners:
                ners[-1]["string"] += ' ' + item["string"]
        else:
            if "-" in tag:
                tag = tag.split('-')[1]
            ners.append({"string": item["string"],
                         "tag": tag})
    ner["ners"] = ners

    # Apply probablepeople / (parse and tag)
    if not use_pp:
        # The flag belongs on the result dict; `ners` is a list and cannot
        # take a string key.
        ner["pp"] = None
        return ner

    pp = []
    for item in ners:
        # Loop over the Stanford NER (per/ person) results,
        # and apply probablepeople, which raises when fails, (so try).
        if "per" not in item["tag"].lower():
            continue
        try:
            parsed = parse(item["string"])
        except Exception:
            log.error("Could not run probablepeople")
            continue  # skip this entity instead of re-raising outside the try

        if parsed:
            pp.append(parsed)
    ner["pp"] = pp
    return ner
Esempio n. 27
0
def lookupPeople(name):
	"""Return the result of ``probablepeople.parse`` for ``name``."""
	return probablepeople.parse(name)