Example #1
def test_collection_with_duplicated(filename_single_document,
                                    filename_many_documents):
    collection = CollectionLazy.from_filenames(filename_single_document,
                                               filename_single_document,
                                               filename_single_document)
    assert len(list(collection.files)) == 3
    assert len(list(collection.articles)) == 1

    collection = CollectionLazy.from_filenames(filename_many_documents,
                                               filename_many_documents,
                                               filename_many_documents)
    assert len(list(collection.files)) == 3
    assert len(list(collection.articles)) == 500
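The test above depends on two pytest fixtures defined elsewhere in the project. A minimal sketch of how they might look, assuming the sample exports ship under docs/examples/ (both paths and the 500-record count are assumptions pieced together from the other examples on this page):

import pytest

# Hypothetical fixtures: the real ones live in the project's conftest.py and
# may point at different sample files.
@pytest.fixture
def filename_single_document():
    # A plain-text ISI/WoS export containing exactly one record.
    return "docs/examples/single-document.txt"

@pytest.fixture
def filename_many_documents():
    # An export expected to contain 500 records, matching the assertion above.
    return "docs/examples/bit-pattern-savedrecs.txt"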
Example #2
def load(collection: CollectionLazy) -> Iterator[ig.Graph]:
    """
    Takes in a collection of ISI records (a CollectionLazy) and spits out an
    iterator over the connected components of their citation graph.

    This function is also responsible for the filtering steps applied below.
    :param collection: bibliographic collection.
    :return: filtered connected components.
    """
    vertices = {}
    pair_labels = []
    for article, reference in collection.citation_pairs(
        pair_parser=collection.metadata_pair_parser
    ):
        art_label, vertices[art_label] = article
        ref_label, vertices[ref_label] = reference
        pair_labels.append((art_label, ref_label))

    graph = ig.Graph(directed=True)
    for label, attrs in vertices.items():
        graph.add_vertex(name=label, label=label, **attrs)

    graph.add_edges(pair_labels)
    graph = graph.simplify()
    valid_vs = graph.vs.select(lambda v: v["label"].lower() != "null").indices
    graph = graph.subgraph(valid_vs)
    valid_vs = graph.vs.select(
        lambda v: v.indegree() != 1 or v.outdegree() != 0
    ).indices
    graph = graph.subgraph(valid_vs)
    for component in graph.clusters(MODE_WEAK):
        subgraph = graph.subgraph(component)
        if len(subgraph.vs.select(_indegree_gt=0, _outdegree_gt=0)) > 0:
            yield subgraph
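A usage sketch for load(), assuming CollectionLazy and the function above are importable from the surrounding project (the file names below are placeholders):

# Sketch only: module layout and file names are assumptions.
collection = CollectionLazy.from_filenames("savedrecs-1.txt", "savedrecs-2.txt")
for component in load(collection):
    # Each component is a directed igraph.Graph carrying the parsed metadata
    # as vertex attributes.
    print(component.vcount(), "vertices,", component.ecount(), "edges")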
Example #3
def load(collection: CollectionLazy) -> Iterator[ig.Graph]:
    """
    Takes in a collection of bibliographic records and yields all of the
    connected components of their citation graph.

    :param CollectionLazy collection: bibliographic collection
    :return: iterator over the connected components
    """
    vertices = {}
    pair_labels = []
    for article, reference in collection.citation_pairs(
            pair_parser=collection.metadata_pair_parser):
        art_label, vertices[art_label] = article
        ref_label, vertices[ref_label] = reference
        pair_labels.append((art_label, ref_label))

    graph = ig.Graph(directed=True)
    for label, attrs in vertices.items():
        graph.add_vertex(name=label, label=label, **attrs)

    graph.add_edges(pair_labels)
    graph = graph.simplify()
    valid_vs = graph.vs.select(lambda v: v["label"].lower() != "null").indices
    graph = graph.subgraph(valid_vs)
    valid_vs = graph.vs.select(
        lambda v: v.indegree() != 1 or v.outdegree() != 0).indices
    graph = graph.subgraph(valid_vs)
    for component in graph.clusters(MODE_WEAK):
        subgraph = graph.subgraph(component)
        if len(subgraph.vs.select(_indegree_gt=0, _outdegree_gt=0)) > 0:
            yield subgraph
Example #4
def test_collection():
    """
    A rough end-to-end test.
    """
    collection = CollectionLazy('docs/examples/bit-pattern-savedrecs.txt')
    for article in collection.articles:
        assert article.TI
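TI is the ISI/WoS field tag for the document title, so the assertion checks that every parsed article carries a title. A quick interactive sketch over the same example file (assuming CollectionLazy is imported as in the other examples):

# Sketch: print the first few titles from the example export used in the test.
collection = CollectionLazy('docs/examples/bit-pattern-savedrecs.txt')
for article in list(collection.articles)[:5]:
    print(article.TI)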
Example #5
def to_json(sources, output, raw):
    """
    Build a collection from the sources and print the entries converted to
    JSON format, or dump them into `output`.
    """
    if not sources:
        click.secho("You should give at least one file with documents.", fg="red")
        return

    collection = CollectionLazy.from_filenames(*[f.name for f in sources])
    length = len(collection)
    output.write("[\n")
    for i, article in enumerate(collection.articles):
        fields = field_keys() if raw else field_aliases()

        text = json.dumps(
            {field: article.data[field] for field in fields if field in article},
            indent=2,
        )
        text = "  " + "\n  ".join(text.split("\n"))

        output.write(text)

        if i + 1 < length:
            output.write(",\n")
        else:
            output.write("\n")
    output.write("]")
Example #6
def test_collection_from_glob():
    collection = CollectionLazy.from_glob("docs/examples/*.txt")
    for article in collection.articles:
        assert isinstance(article, Article)

    assert len(list(collection.articles)) == 500

    for file in collection.files:
        assert hasattr(file, "read")
        assert isinstance(file, (io.StringIO, io.TextIOWrapper))
        assert file.tell() == 0
Example #7
def test_collection_from_streams(filename_single_document):
    with open(filename_single_document) as file:
        _ = file.read()

        collection = CollectionLazy(file)
        for article in collection.articles:
            assert isinstance(article, Article)

        for file in collection.files:
            assert hasattr(file, "read")
            assert isinstance(file, (io.StringIO, io.TextIOWrapper))
            assert file.tell() == 0
Example #8
def citation_pairs(sources, output):
    """
    Build a collection from the sources and print the citation pairs in JSON
    format, or dump them into `output`.
    """
    if not sources:
        click.secho("You should give at least one file with documents.", fg="red")
        return

    collection = CollectionLazy.from_filenames(*[f.name for f in sources])
    pairs = collection.citation_pairs()

    # Materialize the (possibly lazy) citation pairs so json.dump can serialize them.
    json.dump(list(pairs), output, indent=2)
Example #9
def main(file, output):
    """ Extrae las caracteristicas básicas de la red """
    collection = CollectionLazy.from_filenames(file)
    citation_pairs = collection.citation_pairs()
    citation_pairs = list(citation_pairs)

    vertices = set()
    for a, b in citation_pairs:
        vertices.add(a)
        vertices.add(b)

    vertices = list(vertices)

    graph = igraph.Graph(directed=True)
    graph.add_vertices(vertices)
    graph.add_edges(citation_pairs)

    graph = graph.components(mode="weak").giant()
    graph = graph.simplify()

    selected = graph.vs.select(
        lambda node: not (node.indegree() == 1 and node.outdegree() == 0))
    graph = graph.subgraph(selected)

    # max_cluster = max(graph.biconnected_components().sizes())
    # for i in graph.biconnected_components():
    #     if len(i) == max_cluster:
    #         cluster_biconnected = graph.subgraph(i)

    graph_undirected = graph.as_undirected()
    max_blondel = max(
        graph_undirected.community_multilevel(weights=None,
                                              return_levels=False).sizes())
    for cluster in graph_undirected.community_multilevel(weights=None,
                                                         return_levels=False):
        if len(cluster) == max_blondel:
            cluster_blondel = graph_undirected.subgraph(cluster)

    features = pd.DataFrame(index=[file])
    for i in (graph, cluster_blondel):
        if i is graph:
            j = "Complet"
        # elif i == cluster_biconnected:
        #     j = "Biconnected_C"
        else:
            j = "Blondel_C"
        nodes = i.vcount()
        edges = i.ecount()
        features["Nodes " + j] = nodes
        features["Edges " + j] = edges

        max_degree = i.maxdegree()
        degree = i.degree()
        betweenness = i.betweenness()
        closeness = i.closeness()

        diameter = i.diameter()
        density = i.density()
        clusters_biconnected = len(i.biconnected_components())
        features["Diameter " + j] = diameter
        features["Density " + j] = density
        features["Cluster biconected " + j] = clusters_biconnected

        mean_degree = st.mean(degree)
        sd_degree = st.stdev(degree)
        var_degree = st.variance(degree)
        mean_indegree = st.mean(i.indegree())
        mean_outdegree = st.mean(i.outdegree())
        features["Max degree " + j] = max_degree
        features["Mean degree " + j] = mean_degree
        features["Sd degree " + j] = sd_degree
        features["Var degree " + j] = var_degree
        features["Mean indegree " + j] = mean_indegree
        features["Mean outdegree " + j] = mean_outdegree

        mean_betweenness = st.mean(betweenness)
        sd_betw = st.stdev(betweenness)
        var_betw = st.variance(betweenness)
        features["Mean betweenness " + j] = mean_betweenness
        features["Sd betweenness " + j] = sd_betw
        features["Var betweenness " + j] = var_betw

        mean_closeness = st.mean(closeness)
        sd_closeness = st.stdev(closeness)
        var_closeness = st.variance(closeness)
        features["Mean closeness " + j] = mean_closeness
        features["Sd closeness " + j] = sd_closeness
        features["Var closeness " + j] = var_closeness

    features.to_csv(output)
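The select() call near the top of the script drops "pendant" references: vertices that are cited exactly once and never cite anything themselves. A toy illustration of that filter, separate from the original script:

import igraph

# Toy citation graph: "a" cites "b", "c" and "d"; "b" cites "c".
graph = igraph.Graph(directed=True)
graph.add_vertices(["a", "b", "c", "d"])
graph.add_edges([("a", "b"), ("a", "c"), ("a", "d"), ("b", "c")])

# Same filter as in the script above: keep only vertices that are not
# cited-once-and-never-citing.
selected = graph.vs.select(
    lambda node: not (node.indegree() == 1 and node.outdegree() == 0))
graph = graph.subgraph(selected)

print(graph.vs["name"])  # ['a', 'b', 'c'] -- the pendant reference "d" is gone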