Exemple #1
0
def get_graph(**options):
    """Build the ETL graph.

    A fablabs extractor and a JSON file loader both feed the shared
    ``normalize`` -> ``write_repr_to_file`` trunk.
    """
    g = bonobo.Graph()
    # Trunk with an explicitly detached input; producers plug into it below.
    g.add_chain(normalize, write_repr_to_file, _input=None)
    # Producer 1: fablabs source, capped at 5 rows.
    g.add_chain(extract_fablabs, bonobo.Limit(5), _output=normalize)
    # Producer 2: JSON file source, capped at 10 rows, named for later lookup.
    g.add_chain(
        json_loader('data.json'),
        bonobo.Limit(10),
        _output=normalize,
        _name="loadjson",
    )
    return g
Exemple #2
0
def get_graph_options(options):
    """Translate command-line options into graph keyword arguments.

    Pops ``limit`` and ``print`` from *options* (mutating it) and returns a
    dict mapping ``_limit``/``_print`` to tuples of bonobo nodes; each tuple
    is empty when its option is absent or falsy.
    """
    limit_value = options.pop("limit", None)
    want_print = options.pop("print", False)

    graph_kwargs = {"_limit": (), "_print": ()}
    if limit_value:
        graph_kwargs["_limit"] = (bonobo.Limit(limit_value),)
    if want_print:
        graph_kwargs["_print"] = (bonobo.PrettyPrinter(),)
    return graph_kwargs
def get_graph(**options):
    """Build a graph that reads from ``getotherdata``, keeps at most 10
    rows, and pretty-prints them."""
    graph = bonobo.Graph()
    graph.add_chain(getotherdata, bonobo.Limit(10), bonobo.PrettyPrinter())
    return graph
Exemple #4
0
def get_graph(**options):
    """Extract fablabs, let at most 10 records through, and dump each
    record's repr to the output file."""
    steps = (extract_fablabs, bonobo.Limit(10), write_repr_to_file)
    g = bonobo.Graph()
    g.add_chain(*steps)
    return g
Exemple #5
0
def test_limit_default():
    """A bonobo.Limit() with no argument should pass through exactly 10 items."""
    context = MagicMock()
    collected = []

    currifier = ContextCurrifier(bonobo.Limit())
    with currifier.as_contextmanager(context) as stack:
        for _ in range(20):
            collected.extend(stack())

    assert collected == [NOT_MODIFIED] * 10
Exemple #6
0
def get_graph_options(options):
    """Pop ``limit``/``print`` from *options* and build graph kwargs.

    Each returned value is a one-node tuple when the option is truthy,
    otherwise an empty tuple (so callers can splat it into a chain).
    """
    limit = options.pop('limit', None)
    pretty = options.pop('print', False)

    result = {}
    result['_limit'] = (bonobo.Limit(limit),) if limit else ()
    result['_print'] = (bonobo.PrettyPrinter(),) if pretty else ()
    return result
def get_graph(**options):
    """Build the fablabs graph: source -> Limit(10) -> pretty printer."""
    pipeline = [extract_fablabs, bonobo.Limit(10), bonobo.PrettyPrinter()]
    graph = bonobo.Graph()
    graph.add_chain(*pipeline)
    return graph
Exemple #8
0
def get_graph(**options):
    """Build the Scopus document-ingestion graph.

    Reads article ids from ``data/ff-article-ids-17.csv``, fetches each
    document from Scopus (skipping ids already present in MongoDB), stores
    the documents, then pulls the serial (journal) record for every stored
    document.

    NOTE(review): relies on module-level ``limit`` and ``database`` values
    not visible here — confirm they are configured before this runs.

    :return: bonobo.Graph
    """
    graph = bonobo.Graph()

    # # Import authors
    # graph.add_chain(
    #     bonobo.CsvReader('data/ff-faculty.csv', skip=1),
    #     bonobo.Limit(limit),
    #     create_author_document,
    #     FilterDuplicate(collection="jhu-authors", field='hopkins_id', target='hopkins_id', database=database),
    #     MongoWriter(collection='jhu-authors', database=database),
    # )
    #
    # # Retrieve authors from scopus
    # graph.add_chain(
    #     extract_author_scopus_ids,
    #     bonobo.Limit(limit),
    #     FilterDuplicate(collection='scopus-authors', database=database),
    #     get_author,
    #     MongoWriter(collection='scopus-authors', database=database),
    #     _input=create_author_document,
    # )

    # Retrieve documents from scopus
    graph.add_chain(
        bonobo.CsvReader('data/ff-article-ids-17.csv'),
        bonobo.Limit(limit),
        FilterDuplicate(collection='scopus-documents', database=database),
        get_document,
        # Keep errata data. Leave it to downstream analysis. Otherwise it'll be repeatedly downloaded and discarded.
        # remove_errata,
        MongoWriter(collection='scopus-documents', database=database),
    )

    # Extract serials data from Scopus and load into MongoDB.
    # Fed from get_document via _input; the lambda pulls the source id out of
    # each document's coredata (or None when absent).
    graph.add_chain(lambda args: args['coredata'].get('source-id', None),
                    bonobo.Limit(limit),
                    FilterDuplicate(collection='scopus-serials',
                                    database=database),
                    get_serial,
                    MongoWriter(collection='scopus-serials',
                                database=database),
                    _input=get_document)

    return graph
Exemple #9
0
def get_graph_options(options):
    """Pop CLI options and translate them into graph kwargs, with debug logging."""
    logger.debug("Unpacking command line options %s.", options)
    limit = options.pop("limit", None)
    pretty = options.pop("print", False)

    graph_options = {"_limit": (), "_print": ()}
    if limit:
        graph_options["_limit"] = (bonobo.Limit(limit),)
    if pretty:
        graph_options["_print"] = (bonobo.PrettyPrinter(),)

    logger.debug("Created graph options %s.", graph_options)
    return graph_options
Exemple #10
0
def get_graph(**options):
    """
    Build the graph to execute: fablabs source, capped at 10 records,
    pretty-printed to the console.

    :return: bonobo.Graph
    """
    graph = bonobo.Graph()
    graph.add_chain(
        extract_fablabs,
        bonobo.Limit(10),
        bonobo.PrettyPrinter(),
    )
    return graph
Exemple #11
0
def get_graph(**options):
    """Classic ETL chain: extract -> Limit(10) -> transform -> load."""
    steps = (
        extract,
        bonobo.Limit(10),
        # A bonobo.PrettyPrinter() can be inserted here for debugging.
        transform,
        load,
    )
    graph = bonobo.Graph()
    graph.add_chain(*steps)
    return graph
Exemple #12
0
def get_graph(*, _limit=None, _print=False):
    """Read theaters.json, optionally limit the rows, and fan the trunk out
    to an optional printer plus JSON and line-delimited-JSON writers."""
    graph = bonobo.Graph()

    head = [bonobo.JsonReader('datasets/theaters.json')]
    if _limit:
        head.append(bonobo.Limit(_limit))
    trunk = graph.add_chain(*head)

    if _print:
        graph.add_chain(bonobo.PrettyPrinter(), _input=trunk.output)

    graph.add_chain(bonobo.JsonWriter('theaters.json', fs='fs.output'), _input=trunk.output)
    graph.add_chain(bonobo.LdjsonWriter('theaters.ldjson', fs='fs.output'), _input=trunk.output)

    return graph
Exemple #13
0
def get_graph(**options):
    """Build the ETL graph, removing any previous report file first.

    Uses the module-level ``reporte`` path and the ``extract``/``transform``/
    ``load`` nodes; at most 20 records flow through the chain.

    :return: bonobo.Graph
    """
    # Remove the report file so every run starts from scratch.
    # (Fixed: `os.path.isfile(...) is True` was a redundant comparison — the
    # call already returns a bool.)
    if os.path.isfile(reporte):
        os.remove(reporte)

    graph = bonobo.Graph()
    graph.add_chain(
        extract,
        # Cap the amount of data flowing to the next node
        bonobo.Limit(20),
        transform,
        load)
    return graph
Exemple #14
0
def get_graph(*, _limit=None, _print=False):
    """Read theaters.json from the static filesystem, optionally limit and
    print, then write both JSON and line-delimited JSON outputs."""
    graph = bonobo.Graph()

    head = [bonobo.JsonReader("theaters.json", fs="fs.static")]
    if _limit:
        head.append(bonobo.Limit(_limit))
    trunk = graph.add_chain(*head)

    if _print:
        graph.add_chain(bonobo.PrettyPrinter(), _input=trunk.output)

    graph.add_chain(
        bonobo.JsonWriter("theaters.output.json", fs="fs.output"),
        _input=trunk.output,
    )
    graph.add_chain(
        bonobo.LdjsonWriter("theaters.output.ldjson", fs="fs.output"),
        _input=trunk.output,
    )

    return graph
Exemple #15
0
    def handle(
        self,
        input_filename,
        output_filename,
        reader=None,
        reader_option=None,
        writer=None,
        writer_option=None,
        option=None,
        limit=None,
        transformation=None,
    ):
        """Run a one-shot conversion pipeline from *input_filename* to
        *output_filename*.

        Reader and writer factories are resolved from the registry by file
        name (or forced via *reader*/*writer*); ``-`` as the output name
        means pretty-print to the console. *option* applies to both ends,
        *limit* caps the row count, and *transformation* names extra nodes
        for the middle of the chain.
        """
        # Resolve the input side.
        in_factory = default_registry.get_reader_factory_for(
            input_filename, format=reader)
        in_kwargs = _resolve_options((option or []) + (reader_option or []))

        # Resolve the output side; '-' means "print to console".
        if output_filename == '-':
            out_factory = bonobo.PrettyPrinter
            out_args = ()
        else:
            out_factory = default_registry.get_writer_factory_for(
                output_filename, format=writer)
            out_args = (output_filename, )
        out_kwargs = _resolve_options((option or []) + (writer_option or []))

        # Middle of the chain: optional limit plus user transformations.
        middle = ()
        if limit:
            middle += (bonobo.Limit(limit), )
        middle += _resolve_transformations(transformation)

        graph = bonobo.Graph()
        graph.add_chain(
            in_factory(input_filename, **in_kwargs),
            *middle,
            out_factory(*out_args, **out_kwargs),
        )

        return bonobo.run(graph, services={'fs': bonobo.open_fs()})
Exemple #16
0
def get_graph(**options):
    """
    This function builds the graph that needs to be executed.

    Fetches 2018 documents from Scopus (capped at 2 by the Limit node) into
    the ``test`` Mongo database, then fans out from ``get_document`` to
    collect the related authors and serials.

    :return: bonobo.Graph

    """
    graph = bonobo.Graph()
    # Documents: year query -> ids -> dedup -> fetch -> store.
    graph.add_chain(
        get_docs_by_year(2018, False), extract_id, bonobo.Limit(2),
        FilterDuplicate(collection='document', field='_id', database='test'),
        get_document, MongoWriter(collection='document', database='test'))
    # Author
    graph.add_chain(
        get_authors_from_doc,
        FilterDuplicate(collection='author', field='@auid', database='test'),
        lambda args: args['@auid'],
        get_author,
        MongoWriter(collection='author', database='test'),
        # bonobo.JsonWriter('results/authors.json'),
        _input=get_document)
    # Author Affiliation
    # graph.add_chain(
    #     get_author_affl,
    #     bonobo.UnpackItems(0),
    #     bonobo.CsvWriter('results/author-affl.csv'),
    #     _input=get_author
    # )
    # # Affiliations - Skip. Instead, use the affiliation API to retrieve JHU affiliations
    # graph.add_chain(
    #     get_author_affl,
    #     FilterDuplicate(collection='affiliation', field='affiliation'),
    #     lambda args: args['affiliation'],
    #     get_affiliation,
    #     MongoWriter(collection='affiliation'),
    #     _input=get_author
    # )
    # Serial By ID: pull the source id from each document's coredata.
    graph.add_chain(lambda args: args['coredata'].get('source-id', None),
                    FilterDuplicate(collection='serial', database='test'),
                    get_serial,
                    MongoWriter(collection='serial', database='test'),
                    _input=get_document)
    return graph
Exemple #17
0
def get_graph(*, _limit=None, _print=False):
    """Build the coffeeshops CSV -> CSV pipeline with optional row limit
    and pretty-printing.

    :return: bonobo.Graph
    """
    # Fixed: a stray `print(str(args))` after the return was unreachable
    # dead code (and `args` was undefined); it has been removed.
    return bonobo.Graph(bonobo.CsvReader("coffeeshops.csv"),
                        *((bonobo.Limit(_limit), ) if _limit else ()),
                        *((bonobo.PrettyPrinter(), ) if _print else ()),
                        bonobo.CsvWriter("coffeeshops.csv", fs="fs.output"))


def with_opened_file(self, context):
    """Context processor: yield an open handle to the output file, closing
    it when the surrounding context ends."""
    handle = open('output_csv.txt', 'w+')
    try:
        yield handle
    finally:
        handle.close()


# decorator is used here: every time we open the file, and append row to the existing rows, instead of overwriting it
# Or directly use load (not writing to file)
@use_context_processor(with_opened_file)
def write_repr_to_file(f, *row):
    """Append the repr of each incoming row, one per line, to the shared
    file handle provided by the context processor."""
    line = repr(row) + "\n"
    f.write(line)


# if we don't use decorator, only one record will be written (will over-write the old records)
def write_to_file_onetime(*row):
    """Write the row's repr to the trial file, truncating any previous
    content (so only the last call's record survives)."""
    text = repr(row) + "\n"
    with open('output_csv_trial.txt', 'w+') as out:
        out.write(text)


# Script entry point: build the price pipeline and run it.
if __name__ == '__main__':
    graph = bonobo.Graph()
    # get_price feeds transform; at most 20 rows reach the file writer.
    graph.add_chain(
        get_price,
        transform,
        bonobo.Limit(20),
        write_repr_to_file,
    )
    bonobo.run(graph)
Exemple #19
0
    # NOTE(review): this fragment sits inside a function whose definition is
    # not visible here; it relies on pd, defaultdict, getdict,
    # write_dict_to_csv, extract, transform and write_repr_to_file from the
    # enclosing module.
    data = pd.read_csv('train.csv', encoding='ISO-8859-1')

    # construct category dictionary (map the category to index)
    category_dict = defaultdict()
    category_list = data.columns.tolist()
    for i in range(len(category_list)):
        category_dict[category_list[i]] = i
    print(category_dict)

    # construct the terms dictionary (get the dict from data descriptions file)
    term_dictionary = getdict()
    print(term_dictionary)

    # write the terms dictionary into two-column csv
    write_dict_to_csv()

    # divide the table into 3 sub-tables
    # TODO: The index needs to be changed according to the requirements
    # NOTE(review): lotinfo_idx is not used anywhere in this visible fragment.
    lotinfo_idx = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

    # build Bonobo pipeline
    graph = bonobo.Graph()
    graph.add_chain(
        extract,
        # the transform step will replace the abbr. with its full description
        transform,
        bonobo.Limit(100),
        write_repr_to_file,
    )
    bonobo.run(graph)
Exemple #20
0
def get_graph(**options):
    """
    This function builds the graph that needs to be executed.

    A trunk chain (named "main") runs billing rows through invalid_entries,
    fix_numbers, parse_dates, filter_summary, the SK lookups and
    summarize_costs before upserting into the historical-cost fact table.
    One S3 reader chain per month (walking back ``options['months']`` from
    ``options['now']``) feeds that trunk, and a side chain upserts the rows
    as of ``parse_dates`` into a second table.

    :return: bonobo.Graph

    """
    graph = bonobo.Graph()

    graph.add_chain(
        bonobo.CsvWriter('billing.csv'),
        bonobo.JsonWriter('billing.json'),
        invalid_entries,
        fix_numbers,
        parse_dates,
        #bonobo.PrettyPrinter(),
        filter_summary,
        #bonobo.PrettyPrinter(),
        lookup_account_sk,
        lookup_date_sk,
        summarize_costs,
        bonobo.UnpackItems(0),
        bonobo_sqlalchemy.InsertOrUpdate(
            table_name='fact_itsm_aws_historical_cost' +
            options['table_suffix'],
            discriminant=(
                'productname',
                'date_sk',
                'account_name_sk',
            ),
            engine='database'),
        _name="main",
        _input=None,
    )

    now = options['now']

    # Go to beginning of month
    now += relativedelta(day=1, hour=0, minute=0, second=0, microsecond=0)

    # One reader chain per month, walking backwards from the current month.
    when = now
    for log in range(0, options['months']):
        when = when + relativedelta(months=-1)
        tstamp = when.strftime("%Y-%m")
        print("# %d Processing %s" % (log, tstamp))
        # Optional per-chain row cap, splatted into the chain below.
        if options['limit']:
            _limit = (bonobo.Limit(options['limit']), )
        else:
            _limit = ()

        graph.add_chain(
            AwsBillingReader('%s-aws-cost-allocation-%s.csv' %
                             (options['aws_account_id'], tstamp),
                             fs='s3',
                             skip=1),
            *_limit,
            _output="main",
        )

    # Side chain: rows as produced by parse_dates also go to this table.
    graph.add_chain(
        bonobo_sqlalchemy.InsertOrUpdate(
            table_name=options['table'] + options['table_suffix'],
            discriminant=('invoiceid', 'linkedaccountid', 'payeraccountid',
                          'recordid'),
            engine='database'),
        _input=parse_dates,
    )

    return graph
Exemple #21
0
def get_graph(**options):
    """
    This function builds the graph that needs to be executed.

    Starting from a CSV of author names, fetches the authors from Scopus,
    then fans out from the intermediate nodes to collect each author's
    Scopus profile, their documents, the documents' serials, and the
    co-authors of those documents.

    NOTE(review): relies on module-level ``limit`` and ``database`` values
    not visible here — confirm they are configured before this runs.

    :return: bonobo.Graph

    """
    graph = bonobo.Graph()

    # Read data from the CSV file and load into MongoDB
    graph.add_chain(
        bonobo.CsvReader('data/biophysics-author-names.csv'),
        bonobo.Limit(limit),
        get_author_by_name,
        create_author_document,
        MongoWriter(collection='jhu-authors', database=database),
    )

    # Extract authors from Scopus and load into MongoDB
    graph.add_chain(
        extract_authors,
        bonobo.Limit(limit),
        extract_id,
        FilterDuplicate(collection='scopus-authors', database=database),
        get_author,
        MongoWriter(collection='scopus-authors', database=database),
        _input=get_author_by_name
    )

    # Extract documents from Scopus and load into MongoDB
    graph.add_chain(
        extract_id,
        get_docs_by_author,
        bonobo.Limit(limit),
        extract_id,
        FilterDuplicate(collection='scopus-documents', field='_id', database=database),
        get_document,
        remove_errata,
        MongoWriter(collection='scopus-documents', database=database),
        _input=extract_authors
    )

    # Extract serials data from Scopus and load into MongoDB
    # (the lambda pulls the source id out of each document's coredata)
    graph.add_chain(
        lambda args: args['coredata'].get('source-id', None),
        bonobo.Limit(limit),
        FilterDuplicate(collection='scopus-serials', database=database),
        get_serial,
        MongoWriter(collection='scopus-serials', database=database),
        _input=remove_errata
    )

    # Extract co-authors data from Scopus and load into MongoDB
    graph.add_chain(
        get_authors_from_doc,
        bonobo.Limit(limit),
        FilterDuplicate(collection='scopus-authors', field='@auid', database=database),
        lambda args: args['@auid'],
        get_author,
        MongoWriter(collection='scopus-authors', database=database),
        # bonobo.JsonWriter('results/authors.json'),
        _input=remove_errata
    )

    return graph
Exemple #22
0
def get_graph(*, _limit=None, _print=False):
    """Coffeeshops pipeline: txt reader -> optional limit/printer -> csv writer."""
    nodes = [bonobo.CsvReader('datasets/coffeeshops.txt')]
    if _limit:
        nodes.append(bonobo.Limit(_limit))
    if _print:
        nodes.append(bonobo.PrettyPrinter())
    nodes.append(bonobo.CsvWriter('coffeeshops.csv', fs='fs.output'))
    return bonobo.Graph(*nodes)
Exemple #23
0
def test_limit():
    """A Limit(2) node lets exactly two items through, however often called."""
    node = bonobo.Limit(2)
    passed = []
    for _ in range(42):
        passed.extend(node())
    assert passed == [NOT_MODIFIED] * 2
Exemple #24
0
def test_limit_not_there():
    """When fewer items flow than the limit allows, everything passes through."""
    node = bonobo.Limit(42)
    seen = [status for _ in range(10) for status in node()]
    assert seen == [NOT_MODIFIED] * 10
Exemple #25
0
def get_graph():
    """Build the pipeline: extract -> transform -> csv and postgres loaders
    -> Limit(1000) -> pretty printer."""
    steps = (
        extract,
        transform,
        load_csv,
        load_postgres,
        bonobo.Limit(1000),
        bonobo.PrettyPrinter(),
    )
    graph = bonobo.Graph()
    graph.add_chain(*steps)
    return graph