Example #1
def merge_names_from_json_file(json_name_file, people_db_pickle_file):
    """
    Creates a people db from reading json file (dict of raw names and counts) and merges people
    in it. Stores people db in a pickle file
    :param json_name_file: Path to json file
    :param people_db_pickle_file: Path for output pickle file of the created PeopleDB
    :return:
    """

    with open(json_name_file, 'r') as infile:
        name_dict = json.load(infile)

    initial_time = time.time()

    # add everyone to a PeopleDatabase
    people_db = PeopleDatabase()
    for name, count in name_dict.items():
        # skip rare names that appear fewer than 3 times
        if count >= 3:
            people_db.add_person_raw(name_raw=name, count=count)

    print("Length: ", len(people_db))

    # export the positions csv, then merge the duplicate / similar names
    people_db.create_positions_csv()
    people_db.merge_duplicates()

    people_db.store_to_disk(people_db_pickle_file)
    print("Merging names took", time.time() - initial_time)
Example #2
def generate_network_whole_industry():
    """
    Generate a network where the nodes are not people but companies

    :return:
    """

    # Load people db
    people_db_path = Path('..', 'data', 'network_generation',
                          'people_db_1970s.pickle')
    people_db = PeopleDatabase()
    people_db.load_from_disk(people_db_path)

    # load the whole 1970s network
    network = get_network_of_1970s_nodes_and_edges()
    edges = network['edges']

    org_counter = Counter()
    connection_counter = Counter()

    # count each organization's document totals and each pair of
    # organizations' connection strength
    for edge in edges.values():
        person1, person2 = edge['edge']
        if not person1 or not person2:
            continue
        p1m = person1.most_likely_position
        p2m = person2.most_likely_position
        if (p1m != p2m
                and p1m != 'no positions available'
                and p2m != 'no positions available'):
            org_counter[p1m] += edge['count']
            org_counter[p2m] += edge['count']
            connection_counter[tuple(sorted((p1m, p2m)))] += edge['count']

    nodes_out = []
    edges_out = []
    for org, org_count in org_counter.most_common():
        nodes_out.append({
            'name': org,
            'docs': org_count,
            'words': 0,
            'affiliation': 'test'  # placeholder affiliation
        })
    for edge, edge_count in connection_counter.most_common():
        edges_out.append({
            'node1': edge[0],
            'node2': edge[1],
            'docs': edge_count,
            'words': 0
        })

    store_network_for_visualization(nodes_out,
                                    edges_out,
                                    center_names=[],
                                    network_name='industry',
                                    file_name='whole_industry.json')
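
The tuple(sorted(...)) idiom above makes the connection counter undirected. A self-contained illustration:

from collections import Counter

# (A, B) and (B, A) normalize to the same key, so both directions of a
# connection accumulate in a single Counter entry.
connections = Counter()
connections[tuple(sorted(('Philip Morris', 'RJ Reynolds')))] += 2
connections[tuple(sorted(('RJ Reynolds', 'Philip Morris')))] += 3
print(connections)  # Counter({('Philip Morris', 'RJ Reynolds'): 5})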
Example #3
def create_db_of_1970s_docs_from_csv():  # pylint: disable=C0103
    """
    We have this strange 1970s db from November 2019 but I don't know how it was created.
    This script simply uses the docs_1970s_all.csv to create a people_db using the info found in
    those documents.

    :return:
    """

    print("Generating new 1970s People DB")

    df = pd.read_csv(DOCS_CSV_PATH).fillna('')  # pylint: disable=C0103
    people_db = PeopleDatabase()

    counters = {
        'valid': Counter(),                     # valid person
        'organization_from_person': Counter(),  # valid organizations extracted from person col
        'organization_from_org': Counter(),     # valid organizations extracted from org col
        'organization_invalid': Counter(),      # invalid organizations from org col
        'invalid': Counter(),                   # not a valid person
        'error': Counter(),                     # threw an error
    }

    for idx, doc in df.iterrows():  # iterate over all documents
        if idx % 1000 == 0:
            print(idx)

        doc_authors, doc_author_orgs = parse_authors_or_recipients_of_doc(
            'authors', doc, counters, people_db)
        doc_recipients, doc_recipient_orgs = parse_authors_or_recipients_of_doc(
            'recipients', doc, counters, people_db)

        doc_author_orgs += parse_au_or_rc_organizations_of_doc(
            'authors', doc, counters, people_db)
        doc_recipient_orgs += parse_au_or_rc_organizations_of_doc(
            'recipients', doc, counters, people_db)

        for person in doc_authors:
            people_db.add_person_raw(name_raw=person,
                                     position=Counter(doc_author_orgs))
        for person in doc_recipients:
            people_db.add_person_raw(name_raw=person,
                                     position=Counter(doc_recipient_orgs))

    len_before_merge = len(people_db)
    people_db.merge_duplicates()
    print("before", len_before_merge, ". after", len(people_db))

    people_db.store_to_disk(PEOPLE_DB_PATH)
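
The fillna('') call matters for the row parsing above: it guarantees that string columns never yield NaN. A self-contained illustration:

import pandas as pd

# Without fillna(''), the missing author comes back as NaN (a float)
# and breaks string parsing downstream.
df = pd.DataFrame({'authors': ['Dunn, WL', None]})
for idx, doc in df.fillna('').iterrows():
    print(idx, repr(doc['authors']))  # 0 'Dunn, WL' / 1 ''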
Example #4
def search_possible_matches(name, people_db=None):
    """
    Search for possible alias matches given a name
    :param name:
    :param people_db:
    :return: Counter
    """

    if not people_db:
        people_db = PeopleDatabase()
        people_db.load_from_disk(Path(PEOPLE_DB_PATH))

    possible_matches = Counter()

    # check every known alias for the search name
    for alias in people_db._alias_to_person_dict:  # pylint: disable=W0212
        if name in alias:
            person_obj = people_db.get_person_from_alias(alias)
            possible_matches[person_obj] = person_obj.count

    return possible_matches
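
A minimal usage sketch, assuming the default people db pickle exists at PEOPLE_DB_PATH:

# Search with a name fragment; the result maps Person objects to counts.
matches = search_possible_matches('TEMKO')
for person, count in matches.most_common(5):
    print(count, person)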
Example #5
def setUp(self):
    # create a small PeopleDatabase for testing
    test_peopledb = PeopleDatabase()
    test_peopledb.add_person_raw("Dunn, WL", 2)
    test_peopledb.add_person_raw("Dunn, William L", 4)
    test_peopledb.add_person_raw("TEAGUE CE JR", 3)
    test_peopledb.add_person_raw("TEMKO SL, COVINGTON AND BURLING", 5)
    test_peopledb.add_person_raw("TEMKO SL, COVINGTON BURLING", 3)
    test_peopledb.add_person_raw("TEMKO SL, COVINGTON BURLING", 3)
    # merge duplicate people
    test_peopledb.merge_duplicates(print_merge_results_for_name=None)
    # store it as a pickle file
    self.test_peopledb_pickle = Path(DATA_PATH, 'django', 'test_peopledb.pickle')
    test_peopledb.store_to_disk(self.test_peopledb_pickle)
    # file path to test csv file for docs
    self.test_docs_csv = Path(DATA_PATH, "django", "test_import_docs.csv")
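
A follow-up test might verify the merge. The expected total of 3 distinct people is an assumption about how merge_duplicates resolves these name variants:

def test_merge_results(self):
    # Assumed behavior: "Dunn, WL"/"Dunn, William L" collapse into one
    # person and the TEMKO/COVINGTON variants into another, leaving
    # 3 people (Dunn, Teague, Temko) in the reloaded db.
    test_peopledb = PeopleDatabase()
    test_peopledb.load_from_disk(self.test_peopledb_pickle)
    self.assertEqual(len(test_peopledb), 3)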
Example #6
def get_network_of_1970s_nodes_and_edges():  # pylint: disable=C0103,R0914
    """
    Get or create a network of nodes and edges based on the 1970s people database

    :return:
    """

    try:
        with open(NETWORK_PATH, 'rb') as infile:
            return pickle.load(infile)
    except FileNotFoundError:

        people_db = PeopleDatabase()
        try:
            people_db.load_from_disk(PEOPLE_DB_PATH)
        except FileNotFoundError:
            create_db_of_1970s_docs_from_csv()
            people_db.load_from_disk(PEOPLE_DB_PATH)

        df = pd.read_csv(DOCS_CSV_PATH).fillna('')  # pylint: disable=C0103

        nodes = {}
        edges = {}
        counters = {
            'valid': Counter(),                     # valid person
            'organization_from_person': Counter(),  # valid organizations extracted from person col
            'organization_from_org': Counter(),     # valid organizations extracted from org col
            'organization_invalid': Counter(),      # invalid organizations from org col
            'invalid': Counter(),                   # not a valid person
            'error': Counter(),                     # threw an error
        }

        for idx, doc in df.iterrows():  # iterate over all documents
            if idx % 1000 == 0:
                print(idx)

            doc_authors, _ = parse_authors_or_recipients_of_doc(
                'authors', doc, counters, people_db)
            doc_recipients, _ = parse_authors_or_recipients_of_doc(
                'recipients', doc, counters, people_db)

            d_authors = []
            for author in doc_authors:
                author_person = people_db.get_person_from_alias(author)
                if author_person:
                    d_authors.append(author_person)
                else:
                    print("could not find", author)
            doc_authors = d_authors

            d_recipients = []
            for recipient in doc_recipients:
                recipient_person = people_db.get_person_from_alias(recipient)
                if recipient_person:
                    d_recipients.append(recipient_person)
                else:
                    print("Could not find", recipient)
            doc_recipients = d_recipients

            for author in doc_authors:
                author.docs_authored.append(doc)
                if author in nodes:
                    nodes[author]['count_authored'] += 1
                else:
                    nodes[author] = {
                        'person': author,
                        'count_authored': 1,
                        'count_received': 0
                    }

            for recipient in doc_recipients:
                if recipient in nodes:
                    nodes[recipient]['count_received'] += 1
                else:
                    nodes[recipient] = {
                        'person': recipient,
                        'count_authored': 0,
                        'count_received': 1
                    }

            for author in doc_authors:
                for recipient in doc_recipients:
                    edge = tuple(sorted([author, recipient]))
                    if edge in edges:
                        edges[edge]['count'] += 1
                    else:
                        edges[edge] = {'edge': edge, 'count': 1}

        with open(NETWORK_PATH, 'wb') as out:
            network = {'nodes': nodes, 'edges': edges}
            pickle.dump(network, out)

        # recurse once: the network is now cached on disk, so the try
        # branch above will load and return it
        return get_network_of_1970s_nodes_and_edges()
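
The function follows a get-or-create caching pattern: return the pickled network if it exists, otherwise build it, store it, and recurse once to load it back. The pattern in miniature (a generic sketch, not code from the original repo):

import pickle

def get_or_create(path, build):
    # Return the cached object if present; otherwise build it, cache it
    # to disk, and recurse once so the try branch loads it back.
    try:
        with open(path, 'rb') as infile:
            return pickle.load(infile)
    except FileNotFoundError:
        with open(path, 'wb') as out:
            pickle.dump(build(), out)
        return get_or_create(path, build)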
Example #7
def generate_people_network(  # pylint: disable=R0914
        names,
        network_name,
        max_number_of_nodes=100,
        include_2nd_degree_connections=False):
    """
    Generate the network of one or multiple people. The resulting json is stored in
    backend/data.
    :param names: list
    :param network_name: str
    :param max_number_of_nodes: int
    :param include_2nd_degree_connections: bool
    :return:
    """
    if include_2nd_degree_connections:
        network_name += '_including_2nd_degree_edges'

    # Load people db
    people_db = PeopleDatabase()
    people_db.load_from_disk(Path(PEOPLE_DB_PATH))

    # initialize the center group of people
    center_people = set()
    for name in names:
        db_person = people_db.get_person_from_alias(name)
        if db_person:
            center_people.add(db_person)
        else:
            print(f'Could not find {name}. Possible candidates:')
            possible_matches = search_possible_matches(name[:5], people_db)
            for result in possible_matches.most_common(5):
                print(result)
            raise KeyError(name)

    # load the whole 1970s network
    network = get_network_of_1970s_nodes_and_edges()
    edges = network['edges']

    nodes_temp = Counter()
    nodes_out = []

    center_person_doc_counter = Counter()

    # first identify all the primary edges including at least one person from center_people
    for edge in edges.values():

        person1 = people_db.get_person_from_alias(
            edge['edge'][0].aliases.most_common(1)[0][0])
        person2 = people_db.get_person_from_alias(
            edge['edge'][1].aliases.most_common(1)[0][0])

        if not person1 or not person2:
            continue
        if ((person1 in center_people or person2 in center_people) and  # pylint: disable=R0916
            (person1.first != ''
             or person1.most_likely_position != 'no positions available') and
            (person2.first != ''
             or person2.most_likely_position != 'no positions available') and
            (person1.full_name not in NAMES_TO_SKIP) and
            (person2.full_name not in NAMES_TO_SKIP)):
            nodes_temp[person1] += edge['count']
            nodes_temp[person2] += edge['count']

            if person1 in center_people:
                center_person_doc_counter[person1] += 1
            if person2 in center_people:
                center_person_doc_counter[person2] += 1

    print("showing number of documents per central person")
    for person, count in center_person_doc_counter.most_common():
        print(count, person.full_name)
    if len(center_person_doc_counter) != len(center_people):
        raise ValueError(
            "Found 0 documents for at least one central person; investigate!"
        )

    # store all people in the network to be displayed in their own network
    new_people_db = PeopleDatabase()
    for node, node_count in nodes_temp.most_common(max_number_of_nodes):
        print("\n", node_count, "\n", node)
        node.count = node_count
        new_people_db.people.add(node)
    new_people_db.generate_alias_to_person_dict()
    new_people_db.merge_duplicates(manual_merge=True)

    for node in sorted(new_people_db.people, key=lambda x: x.count, reverse=True):
        nodes_out.append({
            'name': node.full_name,
            'docs': nodes_temp[node],
            'words': 0,
            'affiliation': node.most_likely_position
        })

    edges_out = []
    for edge in edges.values():
        # with additional merges, the people in the db have changed -> we need to look them
        # up again via one of their aliases.
        person1 = new_people_db.get_person_from_alias(
            edge['edge'][0].aliases.most_common(1)[0][0])
        person2 = new_people_db.get_person_from_alias(
            edge['edge'][1].aliases.most_common(1)[0][0])

        if person1 and person2:
            if (person1 in center_people or person2 in center_people
                    or (include_2nd_degree_connections and edge['count'] > 5)):
                edges_out.append({
                    'node1': person1.full_name,
                    'node2': person2.full_name,
                    'docs': edge['count'],
                    'words': 0
                })
                if edge['count'] == 0:
                    raise ValueError("count of edge should not be zero.")

    store_network_for_visualization(nodes_out,
                                    edges_out,
                                    center_names=names,
                                    network_name=f'person_{network_name}',
                                    file_name=f'person_{network_name}.json')
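
A hypothetical invocation; each name must resolve to an alias in the people db, or the function prints candidate matches and raises KeyError:

# Hypothetical call: writes person_dunn.json via store_network_for_visualization.
generate_people_network(names=['Dunn, WL'],
                        network_name='dunn',
                        max_number_of_nodes=50,
                        include_2nd_degree_connections=False)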