Example #1
def run(api_host, revisions, labels_f, trusted_groups, trusted_edits,
        revert_radius, revert_window, exclude_reverted, exclude_reverting,
        threads, verbose):

    # Construct our API session
    session = mwapi.Session(
        api_host,
        user_agent="wikimedia scoring platform/editquality -- autolabel")

    autolabel = autolabeler(session, trusted_groups, trusted_edits,
                            revert_radius, revert_window, exclude_reverted,
                            exclude_reverting)

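    # Duplicate the revisions iterator so we can count the total for the
    # progress bar without consuming the copy that feeds the workers.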
    revisions, revisions2 = tee(revisions)
    number_of_revisions = sum(1 for line in revisions2)
    rev_id_chunks = chunk(revisions, 50)
    tq = tqdm(para.map(autolabel, rev_id_chunks, mappers=threads),
              file=sys.stderr,
              total=number_of_revisions)
    verbose_result = ''
    for revision in tq:
        if verbose:
            if not revision['autolabel']['needs_review']:
                verbose_result += '.'
            else:
                verbose_result += (revision['autolabel']['review_reason']
                                   or "?")[0]

        labels_f.write(json.dumps(revision))
        labels_f.write("\n")

    if verbose:
        sys.stderr.write(verbose_result + "\n")
        sys.stderr.flush()
Example #2
def run(api_host, revisions, labels_f, trusted_groups, trusted_edits,
        revert_radius, revert_window, exclude_reverted, exclude_reverting,
        threads, verbose):

    # Construct our API session
    session = mwapi.Session(
        api_host, user_agent="wiki-ai/editquality -- autolabel script")

    autolabel = autolabeler(session, trusted_groups, trusted_edits,
                            revert_radius, revert_window, exclude_reverted,
                            exclude_reverting)

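    # Batch rev ids into chunks of 50 and autolabel each chunk in a worker.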
    rev_id_chunks = chunk(revisions, 50)
    for revision in para.map(autolabel, rev_id_chunks, mappers=threads):
        if verbose:
            if not revision['autolabel']['needs_review']:
                sys.stderr.write(".")
            else:
                sys.stderr.write((revision['autolabel']['review_reason']
                                  or "?")[0])

            sys.stderr.flush()

        labels_f.write(json.dumps(revision))
        labels_f.write("\n")

    if verbose:
        sys.stderr.write("\n")
        sys.stderr.flush()
Example #3
def process_all(paths):
    git_root_dir = "/export/scratch2/levon003/repos/wiki-ores-feedback"
    derived_data_dir = os.path.join(git_root_dir, "data", "derived")
    working_dir = os.path.join(derived_data_dir, 'stub-history-all-revisions',
                               'oidb')
    os.makedirs(working_dir, exist_ok=True)

    #output_filepath = os.path.join(working_dir, 'oidb.sqlite')
    #engine = get_engine(output_filepath)
    #create_tables(engine)

    #metadata = MetaData(bind=engine)
    #metadata.reflect()

    #page_metadata = metadata.tables['page_metadata']
    #revision = metadata.tables['revision']
    #conn = engine.connect()

    start = datetime.now()
    processed_count = 0
    page_processed_count = 0
    curr_batch = []
    FORCE_COMMIT_SIZE = 100000

    with open(os.path.join(working_dir, 'revs_unsorted.tsv'),
              'w') as outfile, open(os.path.join(working_dir, 'page.ndjson'),
                                    'w') as page_outfile:
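        # One mapper per input file; each worker streams back page and
        # revision results from its stub-history dump.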
        for result in para.map(process_stub_history_filepath,
                               paths,
                               mappers=len(paths)):
            if 'wiki_namespace' in result:
                # this is a page result
                page_outfile.write(json.dumps(result) + "\n")
                page_processed_count += 1
                if page_processed_count % 100000 == 0:
                    print(
                        f"Processed {page_processed_count} pages in {datetime.now() - start}"
                    )
            else:
                #outfile.write(json.dumps(result) + "\n")
                outfile.write(
                    ("{rev_timestamp}\t{page_id}\t{rev_id}\t{prev_rev_id}\t"
                     "{is_minor}\t{user_text}\t{user_id}\t{seconds_to_prev}\t"
                     "{curr_bytes}\t{delta_bytes}\t{has_edit_summary}\t"
                     "{is_reverted}\t{is_revert}\t{is_reverted_to_by_other}\t"
                     "{is_self_reverted}\t{is_self_revert}\t{revert_target_id}\t"
                     "{revert_set_size}\t{revert_id}\t{seconds_to_revert}\n"
                     ).format(**result))

                #curr_batch.append(result)
                #if len(curr_batch) >= FORCE_COMMIT_SIZE:
                #    conn.execute(revision.insert(), curr_batch)
                #    curr_batch = []
                processed_count += 1
                if processed_count % 1000000 == 0:
                    print(
                        f"Processed {processed_count} revisions in {datetime.now() - start}"
                    )
    #if len(curr_batch) > 0:
    #    conn.execute(revision.insert(), curr_batch)
    print(
        f"Finished processing {processed_count} revisions (and {page_processed_count} pages) in {datetime.now() - start}"
    )
Example #4
def run(paths, rate):

    writer = mysqltsv.Writer(sys.stdout, headers=HEADERS)

    def process_path(path):
        f = mwcli.files.reader(path)

        return sample_tokens((json.loads(line) for line in f), rate)

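    # Each worker samples tokens from one input file; rows are written out
    # as they arrive.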
    for values in para.map(process_path, paths):
        writer.write(values)
Example #5
def run(paths, rate):

    writer = mysqltsv.Writer(sys.stdout, headers=HEADERS)

    def process_path(path):
        f = mwcli.files.reader(path)

        return sample_tokens((json.loads(line) for line in f), rate)

    for values in para.map(process_path, paths):
        writer.write(values)
Example #6
def process_all(paths):
    start = datetime.now()
    with open(os.path.join(working_dir, 'rev_ids.csv'), 'w') as outfile:
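        # One mapper per stub-history file; each result is one revision row.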
        for result in para.map(process_stub_history_filepath,
                               paths,
                               mappers=len(paths)):
            page_id, rev_id, rev_timestamp, rev_user_text, rev_user_id, is_revert_target, is_reverted, is_reverting = result
            outfile.write(
                f"{page_id},{rev_id},{rev_timestamp},{rev_user_text},{rev_user_id},{is_revert_target},{is_reverted},{is_reverting}\n"
            )
    print(f"{datetime.now() - start}")
Example #7
def process_all(paths):
    git_root_dir = "/export/scratch2/levon003/repos/wiki-ores-feedback"
    derived_data_dir = os.path.join(git_root_dir, "data", "derived")
    working_dir = os.path.join(derived_data_dir, 'stub-history-all-revisions', 'oidb')
    os.makedirs(working_dir, exist_ok=True)
        
    start = datetime.now()
    processed_count = 0
    curr_batch = []
    
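    # One mapper per input file; each result is a (user_id, edit_count) pair.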
    with open(os.path.join(working_dir, 'pre2018_edit_counts_ungrouped.tsv'), 'w') as outfile:
        for result in para.map(process_stub_history_filepath, paths, mappers=len(paths)):
            user_id, count = result
            outfile.write(str(user_id) + "\t" + str(count) + "\n")
            processed_count += 1
            if processed_count % 100000 == 0:
                print(f"Processed {processed_count} users in {datetime.now() - start}")
    print(f"Finished processing {processed_count} users in {datetime.now() - start}")
Example #8
def map(process, paths, threads=None):
    u"""
    Implements a distributed stategy for processing XML files.  This
    function constructs a set of py:mod:`multiprocessing` threads (spread over
    multiple cores) and uses an internal queue to aggregate outputs.  To use
    this function, implement a `process()` function that takes two arguments
    -- a :class:`mwxml.Dump` and the path the dump was loaded
    from. Anything that this function ``yield``s will be `yielded` in turn
    from the :func:`mwxml.map` function.

    :Parameters:
        paths : `iterable` ( `str` | `file` )
            a list of paths to dump files to process
        process : `func`
            A function that takes a :class:`~mwxml.iteration.dump.Dump` and the
            path the dump was loaded from, and yields output values
        threads : int
            the number of individual processing threads to spool up

    :Example:

        >>> import mwxml
        >>> files = ["examples/dump.xml", "examples/dump2.xml"]
        >>>
        >>> def page_info(dump, path):
        ...     for page in dump:
        ...         yield page.id, page.namespace, page.title
        ...
        >>> for id, namespace, title in mwxml.map(page_info, files):
        ...     print(id, namespace, title)
        ...
    """
    paths = [mwtypes.files.normalize_path(path) for path in paths]

    def process_path(path):
        dump = Dump.from_file(mwtypes.files.reader(path))
        for x in process(dump, path):
            yield x

    for x in para.map(process_path, paths, mappers=threads):
        yield x
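The examples above all rely on the same pattern: para.map(process, items, mappers=N) distributes the items across the workers and yields back whatever each process(item) call yields, as results become available. Below is a minimal self-contained sketch of that pattern; the worker function and input file names are hypothetical, not taken from any of the projects above.

import para


def count_lines(path):
    # Each worker handles one file and yields a single (path, line_count) pair.
    with open(path) as f:
        yield path, sum(1 for _ in f)


# Hypothetical input files.
paths = ["a.txt", "b.txt", "c.txt"]

# Results arrive as workers finish, so output order is not guaranteed.
for path, n in para.map(count_lines, paths, mappers=3):
    print(path, n)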
Example #9
    def run(self, paths, threads, kwargs, output_dir, compression, verbose):

        def process_path(path):
            f = files.reader(path)
            input = self.file_reader(f)

            docs = self.a2b(input, verbose=verbose,
                            **kwargs)

            if output_dir is None:
                yield from docs
            else:
                new_path = files.output_dir_path(path, output_dir, compression)
                writer = files.writer(new_path)
                for doc in docs:
                    json.dump(doc, writer)
                    writer.write("\n")

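        # Documents yielded by the workers (when no output_dir is given) are
        # streamed to stdout as JSON lines.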
        for doc in para.map(process_path, paths, mappers=threads):
            json.dump(doc, sys.stdout)
            sys.stdout.write("\n")
Example #10
def map(process, paths, threads=None):
    """
    Implements a distributed strategy for processing XML files.  This
    function constructs a set of py:mod:`multiprocessing` threads (spread over
    multiple cores) and uses an internal queue to aggregate outputs.  To use
    this function, implement a `process()` function that takes two arguments
    -- a :class:`mwxml.Dump` and the path the dump was loaded
    from. Anything that this function ``yield``s will be `yielded` in turn
    from the :func:`mwxml.map` function.

    :Parameters:
        paths : `iterable` ( `str` | `file` )
            a list of paths to dump files to process
        process : `func`
            A function that takes a :class:`~mwxml.iteration.dump.Dump` and the
            path the dump was loaded from, and yields output values
        threads : int
            the number of individual processing threads to spool up

    :Example:

        >>> import mwxml
        >>> files = ["examples/dump.xml", "examples/dump2.xml"]
        >>>
        >>> def page_info(dump, path):
        ...     for page in dump:
        ...         yield page.id, page.namespace, page.title
        ...
        >>> for id, namespace, title in mwxml.map(page_info, files):
        ...     print(id, namespace, title)
        ...
    """
    paths = [mwtypes.files.normalize_path(path) for path in paths]

    def process_path(path):
        dump = Dump.from_file(mwtypes.files.reader(path))
        yield from process(dump, path)

    yield from para.map(process_path, paths, mappers=threads)
Example #11
def process_all(paths):
    git_root_dir = "/export/scratch2/levon003/repos/wiki-ores-feedback"
    derived_data_dir = os.path.join(git_root_dir, "data", "derived")
    working_dir = os.path.join(derived_data_dir, 'stub-history-all-revisions')
    os.makedirs(working_dir, exist_ok=True)
    start = datetime.now()
    with open(os.path.join(working_dir, 'rev_ids.csv'), 'w') as outfile:
        processed_count = 0
        for result in para.map(process_stub_history_filepath,
                               paths,
                               mappers=len(paths)):
            page_id, page_namespace, is_page_redirect, rev_id, rev_timestamp, rev_user_text, rev_user_id, is_revert_target, is_reverted, is_reverting = result
            outfile.write(
                f"{page_id},{page_namespace},{is_page_redirect},{rev_id},{rev_timestamp},{rev_user_text},{rev_user_id},{is_revert_target},{is_reverted},{is_reverting}\n"
            )
            processed_count += 1
            if processed_count % 1000000 == 0:
                print(
                    f"Processed {processed_count} revisions in {datetime.now() - start}"
                )
    print(
        f"Finished processing {processed_count} revisions in {datetime.now() - start}"
    )
Example #12
def run(
    api_host,
    revisions,
    labels_f,
    trusted_groups,
    trusted_edits,
    revert_radius,
    revert_window,
    exclude_reverted,
    exclude_reverting,
    threads,
    verbose,
):

    # Construct our API session
    session = mwapi.Session(api_host, user_agent="editquality -- prelabeling script.")

    autolabel = autolabeler(
        session, trusted_groups, trusted_edits, revert_radius, revert_window, exclude_reverted, exclude_reverting
    )

    rev_id_chunks = chunk(revisions, 50)
    for revision in para.map(autolabel, rev_id_chunks, mappers=threads):
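        # Progress characters: "." for auto-labeled revisions, otherwise the
        # first letter of the review reason.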
        if verbose:
            if not revision["autolabel"]["needs_review"]:
                sys.stderr.write(".")
            else:
                sys.stderr.write((revision["autolabel"]["review_reason"] or "?")[0])

            sys.stderr.flush()

        labels_f.write(json.dumps(revision))
        labels_f.write("\n")

    if verbose:
        sys.stderr.write("\n")
        sys.stderr.flush()
Example #13
Or, to parallelize using ``MPI`` (assuming you have
``mpi4py`` installed and are running on a cluster), run

>>> mpi example.py

to send a PBS job to the queue. By default, the output will be 
printed to a log file in the same directory.

'''

from __future__ import division, print_function, absolute_import, unicode_literals
import para
import numpy as np

def quadratic(x, a, b, c):
  '''
  The function we're parallelizing
  
  '''
  
  print("[BEGIN JOB %d]" % x)
  
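  # Busy loop to simulate an expensive computation.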
  for i in range(10**8):
    j = i
  
  print("[END JOB %d]" % x)
  
  return a * x ** 2 + b * x + c

for res in para.map(quadratic, np.arange(50), args = (1, 1, 1), kwargs = {}):
  print(res)