Example #1
 def __init__(self, model: CommunitiesModel, batch_size: int, session: Session, table: str):
     self._log = logging.getLogger("BatchedCommunityResolver")
     self.resolver = progress_bar(
         BatchedHashResolver(self._gen_hashes(model), batch_size, session, table),
         self._log, expected_size=model.count_elements()
     )
     self._prev = None, None, None
Example #2
def evaluate_communities(args):
    log = logging.getLogger("evalcc")
    model = CommunitiesModel().load(args.input)
    patch_tables(args)
    spark = create_spark("evalcc-%s" % uuid4(), **args.__dict__)
    log.info("Preparing the communities' RDD")
    items = []
    for i, c in progress_bar(enumerate(model.communities), log,
                             expected_size=len(model.communities)):
        for m in c:
            if m < len(model.id_to_element):
                items.append(Row(sha1=model.id_to_element[m], community=i))
    log.info("Running")
    items_in_spark = spark.sparkContext.parallelize(items).toDF()
    bags = spark \
        .read \
        .format("org.apache.spark.sql.cassandra") \
        .options(table=args.tables["bags"], keyspace=args.keyspace) \
        .load()
    log.info("Loaded the bags, calculating the vocabulary")
    vocabulary = bags.drop("sha1", "value").distinct().rdd.map(lambda x: x.item).collect()
    vocabulary = {v: i for i, v in enumerate(vocabulary)}
    log.info("Vocabulary size: %d", len(vocabulary))
    element_to_id = {e: i for i, e in enumerate(model.id_to_element)}
    metrics = items_in_spark.join(bags, "sha1").rdd \
        .map(lambda r: (r.community, (element_to_id[r.sha1], vocabulary[r.item], r.value))) \
        .groupByKey() \
        .map(CommunityEvaluator(args.threshold, len(vocabulary))) \
        .reduce(lambda v1, v2: [v1[i] + v2[i] for i in range(4)])
    log.info("Total misses: %d", metrics[0])
    log.info("Average normalized misses: %f", metrics[1] / len(model.communities))
    log.info("Total loss: %f", metrics[2])
    log.info("Average normalized loss: %f", numpy.sqrt(metrics[3] / len(model.communities)))
Example #3
    def convert(self, models_path: List[str], destdir: str) -> int:
        """
        Performs the model -> model conversion. Runs the conversions in a pool of processes.

        :param models_path: List of paths to the models.
        :param destdir: The directory where to store the models. The directory structure is \
                        preserved.
        :return: The number of converted files.
        """
        files = list(models_path)
        self._log.info("Found %d files", len(files))
        if not files:
            return 0
        queue_in = multiprocessing.Manager().Queue()
        queue_out = multiprocessing.Manager().Queue(1)
        processes = [multiprocessing.Process(target=self._process_entry,
                                             args=(i, destdir, queue_in, queue_out))
                     for i in range(self.num_processes)]
        for p in processes:
            p.start()
        for f in files:
            queue_in.put(f)
        for _ in processes:
            queue_in.put(None)
        failures = 0
        for _ in progress_bar(files, self._log, expected_size=len(files)):
            filename, ok = queue_out.get()
            if not ok:
                failures += 1
        for p in processes:
            p.join()
        self._log.info("Finished, %d failed files", failures)
        return len(files) - failures
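
convert() above shows only the producer side of the worker protocol: file paths are fed into queue_in, one None sentinel per worker signals shutdown, and each worker reports a (filename, ok) pair on queue_out so that failures can be counted. The real _process_entry is not part of this snippet; the sketch below is a hypothetical worker loop that merely illustrates that protocol (convert_model is a made-up stand-in for the actual conversion):

def _process_entry(worker_index, destdir, queue_in, queue_out):
    # Hypothetical worker sketch, not the original implementation.
    while True:
        path = queue_in.get()
        if path is None:  # sentinel: convert() puts one per worker
            break
        try:
            convert_model(path, destdir)  # assumed helper that performs one conversion
            queue_out.put((path, True))
        except Exception:
            queue_out.put((path, False))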
Example #5
    def convert(self, srcdir: str, destdir: str, pattern: str="**/*.asdf") -> int:
        """
        Performs the model -> model conversion. Runs the conversions in a pool of processes.

        :param srcdir: The directory to scan for the models.
        :param destdir: The directory where to store the models. The directory structure is \
                        preserved.
        :param pattern: glob pattern for the files.
        :return: The number of converted files.
        """
        self._log.info("Scanning %s", srcdir)
        files = [str(p) for p in Path(srcdir).glob(pattern)]
        self._log.info("Found %d files", len(files))
        if not files:
            return 0
        queue_in = multiprocessing.Manager().Queue()
        queue_out = multiprocessing.Manager().Queue(1)
        processes = [multiprocessing.Process(target=self._process_entry,
                                             args=(i, destdir, srcdir, queue_in, queue_out))
                     for i in range(self.num_processes)]
        for p in processes:
            p.start()
        for f in files:
            queue_in.put(f)
        for _ in processes:
            queue_in.put(None)
        failures = 0
        for _ in progress_bar(files, self._log, expected_size=len(files)):
            filename, ok = queue_out.get()
            if not ok:
                failures += 1
        for p in processes:
            p.join()
        self._log.info("Finished, %d failed files", failures)
        return len(files) - failures
Example #6
 def fetch_model(self,
                 source: str,
                 file: Union[str, BinaryIO],
                 chunk_size: int = DEFAULT_CHUNK_SIZE) -> None:
     self._log.info("Fetching %s...", source)
     r = requests.get(source, stream=True)
     if r.status_code != 200:
         self._log.error(
             "An error occurred while fetching the model, with code %s" %
             r.status_code)
         raise ValueError
     if isinstance(file, str):
         os.makedirs(os.path.dirname(file), exist_ok=True)
         f = open(file, "wb")
     else:
         f = file
     try:
         total_length = int(r.headers.get("content-length"))
         num_chunks = math.ceil(total_length / chunk_size)
         if num_chunks == 1:
             f.write(r.content)
         else:
             for chunk in progress_bar(
                     r.iter_content(chunk_size=chunk_size),
                     self._log,
                     expected_size=num_chunks):
                 if chunk:
                     f.write(chunk)
     finally:
         if isinstance(file, str):
             f.close()
Example #8
 def test_progress_bar(self):
     logger = logging.getLogger("progress")
     logger.setLevel(logging.INFO)
     stream = io.StringIO()
     stream.isatty = lambda: True
     progress.STREAM = stream
     list(progress_bar.progress_bar(range(10), logger, expected_size=10))
     self.assertEqual(stream.getvalue().strip()[-51:],
                      "[################################] 10/10 - 00:00:00")
     progress.STREAM = sys.stderr
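
The test above pins down the contract that every snippet on this page relies on: progress_bar wraps an arbitrary iterable, yields its items unchanged, and renders a textual bar through the given logger, with expected_size needed whenever the iterable has no usable len(). A minimal sketch of that pattern, assuming progress_bar can be imported from the project's utilities (the import path and the squaring loop are illustrative only):

import logging

from utils import progress_bar  # assumed import path; adjust to the actual module

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("demo")

items = range(100)
squares = []
for item in progress_bar(items, log, expected_size=len(items)):
    # progress_bar only reports progress; the per-item work is unaffected
    squares.append(item * item)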
Example #9
    def transform(self, repos, output, num_processes=None):
        """
        Converts repositories to models and saves them to the output directory.

        :param repos: List of repository URLs or paths, or files containing \
                      repository URLs or paths.
        :param output: The output directory where to store the results.
        :param num_processes: Number of processes to use; if negative, use all \
                              CPUs.
        :return: None
        """
        self._args["log_level"] = self._log.level
        if num_processes is None:
            num_processes = self.num_processes
        if num_processes < 0:
            num_processes = multiprocessing.cpu_count()

        inputs = []

        if isinstance(repos, str):
            repos = [repos]

        for repo in repos:
            # check if it's a text file
            if os.path.isfile(repo):
                with open(repo) as f:
                    inputs.extend(l.strip() for l in f)
            else:
                inputs.append(repo)

        os.makedirs(output, exist_ok=True)

        queue = multiprocessing.Manager().Queue(1)
        failures = 0
        with multiprocessing.Pool(processes=num_processes) as pool:
            pool.starmap_async(
                type(self).process_entry,
                zip(inputs, repeat(self._args), repeat(output), repeat(queue),
                    repeat(self._organize_files)))

            for _ in progress_bar(inputs, self._log, expected_size=len(inputs)):
                repo, ok = queue.get()
                if not ok:
                    failures += 1

        self._log.info("Finished, %d failed repos", failures)
        return len(inputs) - failures
Example #10
def convert_bow_to_vw(bow: BOW, output: str):
    log = logging.getLogger("bow2vw")
    log.info("Writing %s", output)
    with open(output, "w") as fout:
        for index in progress_bar(bow, log, expected_size=len(bow)):
            record = bow[index]
            fout.write(record[0].replace(":", "").replace(" ", "_") + " ")
            pairs = []
            for t, v in zip(*record[1:]):
                try:
                    word = bow.tokens[t]
                except (KeyError, IndexError):
                    log.warning("%d not found in the vocabulary", t)
                    continue
                pairs.append("%s:%s" % (word, v))
            fout.write(" ".join(pairs))
            fout.write("\n")
Example #11
def load_and_check(filepaths: list, log: logging.Logger):
    """
    Load Cooccurrences models from the list of file paths and perform simple checks:
    1. If a model contains values greater than MAX_INT32, saturate them to MAX_INT32.
    2. If a model contains negative values, consider it corrupted, report it and skip it.
    """
    for path in progress_bar(filepaths, log):
        coocc = Cooccurrences().load(path)
        negative_values = np.where(coocc.matrix.data < 0)
        if negative_values[0].size > 0:
            log.warning("Model %s is corrupted and will be skipped. "
                        "It contains negative elements.", path)
            continue
        too_big_values = np.where(coocc.matrix.data > MAX_INT32)
        if too_big_values[0].size > 0:
            log.warning("Model %s contains elements with values more than MAX_INT32. "
                        "They will be saturated to MAX_INT32", path)
            coocc.matrix.data[too_big_values] = MAX_INT32
        yield path, coocc
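
load_and_check is a generator, so corrupted models are skipped lazily while the caller iterates over it; a usage sketch with placeholder file names:

import logging

log = logging.getLogger("merge")
paths = ["model-0.asdf", "model-1.asdf"]  # placeholder paths
for path, coocc in load_and_check(paths, log):
    # only models that passed the checks arrive here, already saturated to MAX_INT32
    print(path, coocc.matrix.data.size)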
Example #12
def download(source: str,
             file: Union[str, BinaryIO],
             log: logging.Logger,
             chunk_size: int = -1) -> None:
    """
    Download a file from an HTTP source.

    :param source: URL to fetch.
    :param file: Where to store the downloaded data.
    :param log: Logger.
    :param chunk_size: Size of the download buffer.
    """
    log.info("Fetching %s...", source)
    if chunk_size < 0:
        chunk_size = DEFAULT_DOWNLOAD_CHUNK_SIZE
    r = requests.get(source, stream=True)
    if r.status_code != 200:
        log.error("An error occurred while fetching the model, with code %s" %
                  r.status_code)
        raise ValueError
    if isinstance(file, str):
        os.makedirs(os.path.dirname(file), exist_ok=True)
        f = open(file, "wb")
    else:
        f = file
    try:
        total_length = int(r.headers.get("content-length"))
        num_chunks = math.ceil(total_length / chunk_size)
        if num_chunks == 1:
            f.write(r.content)
        else:
            for chunk in progress_bar(r.iter_content(chunk_size=chunk_size),
                                      log,
                                      expected_size=num_chunks):
                if chunk:
                    f.write(chunk)
    finally:
        if isinstance(file, str):
            f.close()
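
A usage sketch for download, based only on the signature above: the destination can be either a path (missing parent directories are created and the file is closed afterwards) or any writable binary file object owned by the caller, and a negative chunk_size falls back to DEFAULT_DOWNLOAD_CHUNK_SIZE. The URL and paths are placeholders:

import io
import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("download-demo")

# Write straight to a file path.
download("https://example.com/model.asdf", "/tmp/models/model.asdf", log)

# Or stream into an in-memory buffer; the caller keeps ownership of it.
buf = io.BytesIO()
download("https://example.com/model.asdf", buf, log, chunk_size=1 << 20)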
Example #13
def read_identifiers(csv_path: str,
                     use_header: bool,
                     max_identifier_len: int,
                     identifier_col: int,
                     split_identifier_col: int,
                     shuffle: bool = True) -> List[str]:
    """
    Read identifiers from the CSV file and filter out those that are too long.

    :param csv_path: path to the CSV file.
    :param use_header: treat the first line as a regular data line (True) or skip it as a header with column names (False).
    :param max_identifier_len: maximum length of raw identifiers; longer identifiers are skipped.
    :param identifier_col: index of the column with the raw identifier.
    :param split_identifier_col: index of the column with the split identifier in lowercase.
    :param shuffle: indicates whether to reorder the list of identifiers
        at random after reading it.
    :return: list of split identifiers.
    """
    log = logging.getLogger("read_identifiers")
    log.info("Reading data from the CSV file %s", csv_path)
    identifiers = []
    # TODO: Update dataset loading as soon as https://github.com/src-d/backlog/issues/1212 done
    # Think about dataset download step
    with tarfile.open(csv_path, encoding="utf-8") as f:
        assert len(
            f.members) == 1, "One archived file is expected, got: %s" % len(
                f.members)
        content = f.extractfile(f.members[0])
        if not use_header:
            content.readline()
        for line in progress_bar(content.readlines(), log):
            row = line.decode("utf-8").strip().split(",")
            if len(row[identifier_col]) <= max_identifier_len:
                identifiers.append(row[split_identifier_col])
    if shuffle:
        numpy.random.shuffle(identifiers)
    log.info("Number of identifiers after filtering: %s." % len(identifiers))
    return identifiers
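
A usage sketch for read_identifiers, based on the signature above; the archive name and the column indices are placeholders:

identifiers = read_identifiers(
    csv_path="identifiers.csv.tar.gz",  # placeholder: a tar archive with a single CSV inside
    use_header=False,                   # skip the first line as a header
    max_identifier_len=80,
    identifier_col=3,
    split_identifier_col=4,
)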
Example #14
 def _fetch(self, url, where, chunk_size=DEFAULT_CHUNK_SIZE):
     self._log.info("Fetching %s...", url)
     r = requests.get(url, stream=True)
     if isinstance(where, str):
         os.makedirs(os.path.dirname(where), exist_ok=True)
         f = open(where, "wb")
     else:
         f = where
     try:
         total_length = int(r.headers.get("content-length"))
         num_chunks = math.ceil(total_length / chunk_size)
         if num_chunks == 1:
             f.write(r.content)
         else:
             for chunk in progress_bar(
                     r.iter_content(chunk_size=chunk_size),
                     self._log,
                     expected_size=num_chunks):
                 if chunk:
                     f.write(chunk)
     finally:
         if isinstance(where, str):
             f.close()
Example #15
    def get_dependent_reps(self, libs_info, save_to=None):
        """
        Creates a pandas dataframe with all information about the dependent repositories of the given libraries.

        :param libs_info: Pandas dataframe with all information about libraries.
        :param save_to: Path where to save the resulting dataframe, if you want to save it.
        :return: Pandas dataframe with all information about dependent repositories.
        """
        self._log.info("Creating list of dependent repos...")
        if hasattr(libs_info["ID"], "tolist"):
            lib_id2name = dict(
                zip(libs_info["ID"].tolist(), libs_info["Name"].tolist()))
        else:
            lib_id2name = {libs_info["ID"]: libs_info["Name"]}
        pd_result = []
        dependencies_path = os.path.join(self._librariesio_path,
                                         dependencies_filename)
        for chunk in progress_bar(pd.read_csv(
                dependencies_path,
                chunksize=LibrariesIOFetcher.CHUNKSIZE,
                index_col=False),
                                  self._log,
                                  expected_size=100):
            for lib_id in lib_id2name:
                res = chunk[chunk["Dependency Project ID"] == int(lib_id)]
                if len(res) > 0:
                    pd_result.append(res)

        pd_result = pd.concat(pd_result)
        pd_result["url"] = "https://" + \
                           pd_result["Host Type"].map(LibrariesIOFetcher.HOST2LINK) + \
                           pd_result["Repository Name with Owner"]
        if save_to:
            pd_result.to_csv(save_to, index=False)

        return pd_result
Example #16
def preprocess(args):
    log = logging.getLogger("preproc")
    inputs = []
    for i in args.input:
        if os.path.isdir(i):
            inputs.extend([os.path.join(i, f) for f in os.listdir(i)])
        else:
            inputs.append(i)
    all_words = defaultdict(int)
    skipped = 0

    for i, path in progress_bar(enumerate(inputs),
                                log,
                                expected_size=len(inputs)):
        try:
            model = Cooccurrences().load(source=path)
        except ValueError:
            skipped += 1
            continue
        for w in model.tokens:
            all_words[w] += 1

    vs = args.vocabulary_size
    if len(all_words) < vs:
        vs = len(all_words)
    sz = args.shard_size

    if vs < sz:
        raise ValueError("vocabulary_size={0} is less than shard_size={1}. "
                         "You should specify smaller shard_size "
                         "(pass shard_size={0} argument).".format(vs, sz))

    vs -= vs % sz
    words = numpy.array(list(all_words.keys()))
    freqs = numpy.array(list(all_words.values()), dtype=numpy.int64)
    del all_words
    chosen_indices = numpy.argpartition(freqs,
                                        len(freqs) - vs)[len(freqs) - vs:]
    chosen_freqs = freqs[chosen_indices]
    chosen_words = words[chosen_indices]
    border_freq = chosen_freqs.min()
    border_mask = chosen_freqs == border_freq
    border_num = border_mask.sum()
    border_words = words[freqs == border_freq]
    border_words = numpy.sort(border_words)
    chosen_words[border_mask] = border_words[:border_num]

    del words
    del freqs

    sorted_indices = numpy.argsort(chosen_words)
    chosen_freqs = chosen_freqs[sorted_indices]
    chosen_words = chosen_words[sorted_indices]
    word_indices = {w: i for i, w in enumerate(chosen_words)}
    if args.df is not None:
        model = DocumentFrequencies()
        model.construct(docs=len(inputs) - skipped,
                        tokens=chosen_words,
                        freqs=chosen_freqs)
        model.save(args.df)
    del chosen_freqs

    if not os.path.exists(args.output):
        os.makedirs(args.output)

    with open(os.path.join(args.output, "row_vocab.txt"), "w") as out:
        out.write('\n'.join(chosen_words))
    shutil.copyfile(os.path.join(args.output, "row_vocab.txt"),
                    os.path.join(args.output, "col_vocab.txt"))

    del chosen_words
    ccmatrix = csr_matrix((vs, vs), dtype=numpy.int64)
    for i, path in progress_bar(enumerate(inputs),
                                log,
                                expected_size=len(inputs)):
        try:
            model = Cooccurrences().load(path)
        except ValueError:
            continue
        if len(model) == 0:
            continue
        matrix = _extract_coocc_matrix(ccmatrix.shape, word_indices, model)
        # Stage 5 - simply add this converted matrix to the global one
        ccmatrix += matrix

    bool_sums = ccmatrix.indptr[1:] - ccmatrix.indptr[:-1]
    with open(os.path.join(args.output, "row_sums.txt"), "w") as out:
        out.write('\n'.join(map(str, bool_sums.tolist())))
    shutil.copyfile(os.path.join(args.output, "row_sums.txt"),
                    os.path.join(args.output, "col_sums.txt"))
    reorder = numpy.argsort(-bool_sums)
    os.makedirs(args.output, exist_ok=True)
    nshards = vs // args.shard_size
    for row in progress_bar(range(nshards), log, expected_size=nshards):
        for col in range(nshards):

            def _int64s(xs):
                return tf.train.Feature(int64_list=tf.train.Int64List(
                    value=list(xs)))

            def _floats(xs):
                return tf.train.Feature(float_list=tf.train.FloatList(
                    value=list(xs)))

            indices_row = reorder[row::nshards]
            indices_col = reorder[col::nshards]
            shard = ccmatrix[indices_row][:, indices_col].tocoo()

            example = tf.train.Example(features=tf.train.Features(
                feature={
                    "global_row": _int64s(indices_row),
                    "global_col": _int64s(indices_col),
                    "sparse_local_row": _int64s(shard.row),
                    "sparse_local_col": _int64s(shard.col),
                    "sparse_value": _floats(shard.data)
                }))

            with open(
                    os.path.join(args.output,
                                 "shard-%03d-%03d.pb" % (row, col)),
                    "wb") as out:
                out.write(example.SerializeToString())
Example #17
def detect_communities(args):
    log = logging.getLogger("cmd")
    ccsmodel = ConnectedComponentsModel().load(args.input)
    log.info("Building the connected components")
    ccs = defaultdict(list)
    for i, c in enumerate(ccsmodel.id_to_cc):
        ccs[c].append(i)
    buckmat = ccsmodel.id_to_buckets
    buckindices = buckmat.indices
    buckindptr = buckmat.indptr
    total_nvertices = buckmat.shape[0]
    linear = args.edges in ("linear", "1")
    graphs = []
    communities = []
    if not linear:
        log.info("Transposing the matrix")
        buckmat_csc = buckmat.T.tocsr()
    fat_ccs = []
    for vertices in ccs.values():
        if len(vertices) == 1:
            continue
        if len(vertices) == 2:
            communities.append(vertices)
            continue
        fat_ccs.append(vertices)
    log.info("Building %d graphs", len(fat_ccs))
    for vertices in progress_bar(fat_ccs, log, expected_size=len(fat_ccs)):
        if linear:
            edges = []
            weights = []
            bucket_weights = buckmat.sum(axis=0)
            buckets = set()
            for i in vertices:
                for j in range(buckindptr[i], buckindptr[i + 1]):
                    bucket = buckindices[j]
                    weights.append(bucket_weights[0, bucket])
                    bucket += total_nvertices
                    buckets.add(bucket)
                    edges.append((str(i), str(bucket)))
        else:
            edges = set()
            weights = None
            buckets = set()
            for i in vertices:
                for j in range(buckindptr[i], buckindptr[i + 1]):
                    buckets.add(buckindices[j])
            for bucket in buckets:
                buckverts = \
                    buckmat_csc.indices[buckmat_csc.indptr[bucket]:buckmat_csc.indptr[bucket + 1]]
                for i, x in enumerate(buckverts):
                    for y in buckverts:
                        if x < y:
                            edges.add((str(x), str(y)))
            buckets.clear()
            edges = list(edges)
        graph = Graph(directed=False)
        graph.add_vertices(list(map(str, vertices + list(buckets))))
        graph.add_edges(edges)
        graph.edge_weights = weights
        graphs.append(graph)
    log.info("Launching the community detection")
    detector = CommunityDetector(algorithm=args.algorithm, config=args.params)
    if not args.no_spark:
        spark = create_spark("cmd-%s" % uuid4(), **args.__dict__).sparkContext
        communities.extend(spark.parallelize(graphs).flatMap(detector).collect())
    else:
        communities.extend(chain.from_iterable(progress_bar(
            (detector(g) for g in graphs), log, expected_size=len(graphs))))
    log.info("Overall communities: %d", len(communities))
    log.info("Average community size: %.1f", numpy.mean([len(c) for c in communities]))
    log.info("Median community size: %.1f", numpy.median([len(c) for c in communities]))
    log.info("Max community size: %d", max(map(len, communities)))
    log.info("Writing %s", args.output)
    CommunitiesModel().construct(communities, ccsmodel.id_to_element).save(args.output)
Example #19
def id2vec_preprocess(args):
    """
    Loads co-occurrence matrices for several repositories and generates the
    document frequencies and the Swivel protobuf dataset.

    :param args: :class:`argparse.Namespace` with "input", "vocabulary_size", \
                 "shard_size", "df" and "output".
    :return: None
    """
    log = logging.getLogger("preproc")
    log.info("Loading docfreq model from %s", args.docfreq_in)
    df_model = DocumentFrequencies(log_level=args.log_level).load(
        source=args.docfreq_in)
    coocc_model = Cooccurrences().load(args.input)
    if numpy.any(coocc_model.matrix.data < 0):
        raise ValueError(
            ("Co-occurrence matrix %s contains negative elements. "
             "Please check its correctness.") % args.input)
    if numpy.any(numpy.isnan(coocc_model.matrix.data)):
        raise ValueError(("Co-occurrence matrix %s contains nan elements. "
                          "Please check its correctness.") % args.input)

    try:
        df_meta = coocc_model.get_dep(DocumentFrequencies.NAME)
        if df_model.meta != df_meta:
            raise ValueError((
                "Document frequency model you provided does not match dependency inside "
                "Cooccurrences model:\nargs.docfreq.meta:\n%s\ncoocc_model.get_dep"
                "(\"docfreq\")\n%s\n") % (df_model.meta, df_meta))
    except KeyError:
        pass  # There is no docfreq dependency

    vs = args.vocabulary_size
    if len(df_model) < vs:
        vs = len(df_model)
    sz = args.shard_size
    if vs < sz:
        raise ValueError(
            "vocabulary_size=%s is less than shard_size=%s. You should specify a smaller "
            "shard_size (e.g. shard_size=%s)." % (vs, sz, vs))
    vs -= vs % sz
    log.info("Effective vocabulary size: %d", vs)
    df_model = df_model.greatest(vs)
    log.info("Sorting the vocabulary...")
    chosen_words = sorted(df_model.tokens())
    word_indices = {w: i for i, w in enumerate(chosen_words)}

    if not os.path.exists(args.output):
        os.makedirs(args.output)
    with open(os.path.join(args.output, "row_vocab.txt"), "w") as out:
        out.write('\n'.join(chosen_words))
    log.info("Saved row_vocab.txt")
    shutil.copyfile(os.path.join(args.output, "row_vocab.txt"),
                    os.path.join(args.output, "col_vocab.txt"))
    log.info("Saved col_vocab.txt")
    del chosen_words

    ccmatrix = extract_coocc_matrix((vs, vs), word_indices, coocc_model)

    log.info("Planning the sharding...")
    bool_sums = ccmatrix.indptr[1:] - ccmatrix.indptr[:-1]
    reorder = numpy.argsort(-bool_sums)
    with open(os.path.join(args.output, "row_sums.txt"), "w") as out:
        out.write('\n'.join(map(str, bool_sums.tolist())))
    log.info("Saved row_sums.txt")
    shutil.copyfile(os.path.join(args.output, "row_sums.txt"),
                    os.path.join(args.output, "col_sums.txt"))
    log.info("Saved col_sums.txt")

    log.info("Writing the shards...")
    os.makedirs(args.output, exist_ok=True)
    nshards = vs // args.shard_size
    for row in progress_bar(range(nshards), log, expected_size=nshards):
        for col in range(nshards):
            indices_row = reorder[row::nshards]
            indices_col = reorder[col::nshards]
            shard = ccmatrix[indices_row][:, indices_col].tocoo()

            example = tf.train.Example(features=tf.train.Features(
                feature={
                    "global_row": _int64s(indices_row),
                    "global_col": _int64s(indices_col),
                    "sparse_local_row": _int64s(shard.row),
                    "sparse_local_col": _int64s(shard.col),
                    "sparse_value": _floats(shard.data)
                }))

            with open(
                    os.path.join(args.output,
                                 "shard-%03d-%03d.pb" % (row, col)),
                    "wb") as out:
                out.write(example.SerializeToString())
    log.info("Success")
Example #20
def preprocess(args):
    """
    Loads co-occurrence matrices for several repositories and generates the
    document frequencies and the Swivel protobuf dataset.

    :param args: :class:`argparse.Namespace` with "input", "vocabulary_size", \
                 "shard_size", "df" and "output".
    :return: None
    """
    log = logging.getLogger("preproc")
    log.info("Scanning the inputs...")
    inputs = []
    for i in args.input:
        if os.path.isdir(i):
            inputs.extend([os.path.join(i, f) for f in os.listdir(i)])
        else:
            inputs.append(i)
    log.info("Reading word indices from %d files...", len(inputs))
    all_words = defaultdict(int)
    skipped = 0
    for i, path in progress_bar(enumerate(inputs), log, expected_size=len(inputs)):
        try:
            model = Cooccurrences().load(source=path)
        except ValueError:
            skipped += 1
            log.warning("Skipped %s", path)
            continue
        for w in model.tokens:
            all_words[w] += 1
    vs = args.vocabulary_size
    if len(all_words) < vs:
        vs = len(all_words)
    sz = args.shard_size
    if vs < sz:
        raise ValueError(
            "vocabulary_size={0} is less than shard_size={1}. "
            "You should specify smaller shard_size "
            "(pass shard_size={0} argument).".format(vs, sz))
    vs -= vs % sz
    log.info("Effective vocabulary size: %d", vs)
    log.info("Truncating the vocabulary...")
    words = numpy.array(list(all_words.keys()))
    freqs = numpy.array(list(all_words.values()), dtype=numpy.int64)
    del all_words
    chosen_indices = numpy.argpartition(
        freqs, len(freqs) - vs)[len(freqs) - vs:]
    chosen_freqs = freqs[chosen_indices]
    chosen_words = words[chosen_indices]
    border_freq = chosen_freqs.min()
    border_mask = chosen_freqs == border_freq
    border_num = border_mask.sum()
    border_words = words[freqs == border_freq]
    border_words = numpy.sort(border_words)
    chosen_words[border_mask] = border_words[:border_num]
    del words
    del freqs
    log.info("Sorting the vocabulary...")
    sorted_indices = numpy.argsort(chosen_words)
    chosen_freqs = chosen_freqs[sorted_indices]
    chosen_words = chosen_words[sorted_indices]
    word_indices = {w: i for i, w in enumerate(chosen_words)}
    if args.df is not None:
        log.info("Writing the document frequencies to %s...", args.df)
        model = DocumentFrequencies()
        model.construct(docs=len(inputs) - skipped, tokens=chosen_words, freqs=chosen_freqs)
        model.save(args.df)
    del chosen_freqs

    if not os.path.exists(args.output):
        os.makedirs(args.output)

    with open(os.path.join(args.output, "row_vocab.txt"), "w") as out:
        out.write('\n'.join(chosen_words))
    log.info("Saved row_vocab.txt...")
    shutil.copyfile(os.path.join(args.output, "row_vocab.txt"),
                    os.path.join(args.output, "col_vocab.txt"))
    log.info("Saved col_vocab.txt...")

    del chosen_words
    log.info("Combining individual co-occurrence matrices...")
    ccmatrix = csr_matrix((vs, vs), dtype=numpy.int64)
    for i, path in progress_bar(enumerate(inputs), log, expected_size=len(inputs)):
        try:
            model = Cooccurrences().load(path)
        except ValueError:
            log.warning("Skipped %s", path)
            continue
        if len(model) == 0:
            log.warning("Skipped %s", path)
            continue
        matrix = _extract_coocc_matrix(ccmatrix.shape, word_indices, model)
        # Stage 5 - simply add this converted matrix to the global one
        ccmatrix += matrix

    log.info("Planning the sharding...")
    bool_sums = ccmatrix.indptr[1:] - ccmatrix.indptr[:-1]
    with open(os.path.join(args.output, "row_sums.txt"), "w") as out:
        out.write('\n'.join(map(str, bool_sums.tolist())))
    log.info("Saved row_sums.txt...")
    shutil.copyfile(os.path.join(args.output, "row_sums.txt"),
                    os.path.join(args.output, "col_sums.txt"))
    log.info("Saved col_sums.txt...")
    reorder = numpy.argsort(-bool_sums)
    log.info("Writing the shards...")
    os.makedirs(args.output, exist_ok=True)
    nshards = vs // args.shard_size
    for row in progress_bar(range(nshards), log, expected_size=nshards):
        for col in range(nshards):
            def _int64s(xs):
                return tf.train.Feature(
                    int64_list=tf.train.Int64List(value=list(xs)))

            def _floats(xs):
                return tf.train.Feature(
                    float_list=tf.train.FloatList(value=list(xs)))

            indices_row = reorder[row::nshards]
            indices_col = reorder[col::nshards]
            shard = ccmatrix[indices_row][:, indices_col].tocoo()

            example = tf.train.Example(features=tf.train.Features(feature={
                "global_row": _int64s(indices_row),
                "global_col": _int64s(indices_col),
                "sparse_local_row": _int64s(shard.row),
                "sparse_local_col": _int64s(shard.col),
                "sparse_value": _floats(shard.data)}))

            with open(os.path.join(args.output,
                                   "shard-%03d-%03d.pb" % (row, col)),
                      "wb") as out:
                out.write(example.SerializeToString())
    log.info("Success")