Example n. 1
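This snippet scans a compressed dbSNP XML dump and cross-references each candidate SNP against the variant names already stored in the SMARTER MongoDB database, stopping at the first match for debugging purposes.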
# NOTE: the module paths for the project-specific imports below are assumed
import os
import logging

from pathlib import Path

from pymongo import MongoClient
from mongoengine.errors import MultipleObjectsReturned

from src.features.smarterdb import SMARTERDB, VariantSheep, global_connection
from src.features.dbsnp import read_dbSNP, search_agr_bs


def main():
    logger = logging.getLogger(__name__)

    # project base directory, used below to resolve the dbSNP file path
    project_dir = Path(__file__).resolve().parents[2]

    dbsnp_file = "data/external/SHE/dbSNP/ds_ch24.xml.gz"
    dbsnp_path = project_dir / dbsnp_file

    # connect to database through mongoengine
    global_connection()

    # create a raw pymongo client, used below for the aggregation pipeline
    conn = MongoClient('mongodb://localhost:27017/',
                       username=os.getenv("MONGODB_SMARTER_USER"),
                       password=os.getenv("MONGODB_SMARTER_PASS"))

    # aggregation pipeline to collect all SNP names from the database
    pipeline = [{
        # keep only the name field
        "$project": {
            "_id": 0,
            "name": 1
        }
    }, {
        # collapse everything into a single document carrying all the names
        "$group": {
            "_id": None,
            "total": {
                "$sum": 1
            },
            "items": {
                "$push": "$name"
            }
        }
    }]

    logger.info("Search all names in database")

    # execute the aggregation pipeline with pymongo client and get all
    # snp names from database
    result = next(conn[SMARTERDB]["variantSheep"].aggregate(pipeline))

    # collect all names in a set, for fast membership tests
    all_snp_names = set(result["items"])

    logger.info(f"Reading from {dbsnp_path}")

    # cycle among dbSNP objects, keeping only SNPs which pass the filter
    for snp in filter(search_agr_bs, read_dbSNP(dbsnp_path)):
        found = False

        # search for the first ss entry whose locSnpId is a known name
        for i, ss in enumerate(snp["ss"]):
            if ss['locSnpId'] in all_snp_names:
                found = True
                break

        if not found:
            logger.debug(f"Skipping rsId {snp['rsId']}")
            continue

        # keep only the matched ss entry
        snp['ss'] = snp['ss'][i]

        # get the locSnpId to search (ex. OAR24_20639954)
        name = snp['ss']['locSnpId']

        # query by exact name (a regex query would be super slow)
        qs = VariantSheep.objects(name=name)

        if qs.count() > 0:
            # get variant
            try:
                variant = qs.get()
                logger.info(f"{variant}: {snp}")

            except MultipleObjectsReturned:
                logger.error(qs.all())

            # debug: stop after the first match
            break

    logger.info("Completed")
Example n. 2
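This is the core of a small maintenance script: it resolves a dataset record, normalizes the breed parameters, and then inserts or updates the breed through get_or_create_breed(). The snippet starts mid-file, so the imports and the CLI layer are reconstructed below.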
    """Add or update a breed into SMARTER database"""

    logger.info(f"{Path(__file__).name} started")

    # get the dataset object
    dataset = Dataset.objects(file=dataset).get()

    # fix input parameters
    aliases = [BreedAlias(fid=fid, dataset=dataset) for fid in alias]
    species = species.capitalize()
    code = code.upper()

    # get or create a breed object from the given parameters
    breed, modified = get_or_create_breed(
        species=species, name=name, code=code, aliases=aliases)

    if modified:
        logger.info(f"{breed} added to database")

    logger.info(f"{Path(__file__).name} ended")


if __name__ == '__main__':
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # connect to database
    global_connection()

    main()
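
With the click wrapper reconstructed above, the script could be invoked along these lines (script name and option values are purely illustrative):

    python add_breed.py --species sheep --name Texel --code TEX --dataset texel.zip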
Example n. 3
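This data-import script reads a ";"-separated metadata file, upserts one Dataset document per row, extracts each dataset archive into its own working directory and finally dumps the whole Dataset collection as JSON.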
# NOTE: the module paths for the project-specific imports below are assumed
import csv
import logging
import zipfile
import collections

from pathlib import Path

from src.features.utils import sanitize
from src.features.smarterdb import Dataset, global_connection

# project base directory (used below to locate the archive files)
project_dir = Path(__file__).resolve().parents[2]


def main(input_filepath, output_filepath, types):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """

    logger = logging.getLogger(__name__)

    # connect to database
    global_connection()

    with open(input_filepath) as handle:
        reader = csv.reader(handle, delimiter=";")

        header = next(reader)

        # remove the id column from the header
        del header[0]

        # sanitize column names (they must be valid namedtuple field names)
        header = [sanitize(col) for col in header]

        logger.info("Got %s as header" % header)

        # define a datatype for my data
        Record = collections.namedtuple("Record", header)

        for line in reader:
            # remove id from record
            del line[0]

            # replace empty strings with None
            line = [col if col != '' else None for col in line]

            record = Record._make(line)
            logger.debug(record)

            # search for the archive file within the project directory
            # (next() raises StopIteration if no file matches)
            archive_path = next(project_dir.rglob(record.file))
            logger.info(f"Found {archive_path} as archive")

            archive = zipfile.ZipFile(archive_path)

            logger.debug("Get file contents")
            contents = archive.namelist()
            logger.debug(contents)

            # insert or update the dataset with mongoengine's upsert_one
            dataset = Dataset.objects(
                file=record.file).upsert_one(**record._asdict(),
                                             type_=types,
                                             contents=contents)

            # ok, extract contents into a working directory
            # TODO: this doesn't work with plain text files; try to work
            # with compressed data instead
            working_dir = project_dir / f"data/interim/{dataset.id}"
            working_dir.mkdir(exist_ok=True)

            for member in contents:
                test = working_dir / member
                if not test.exists():
                    logger.info(f"Extract '{member}': in '{working_dir}'")
                    archive.extract(member, working_dir)

                else:
                    logger.debug(f"Skipping {member}: already extracted")

    with open(output_filepath, "w") as handle:
        # after the inserts, dump the whole Dataset collection as JSON
        handle.write(Dataset.objects.to_json(indent=2))

    logger.info(f"Data written into database and in {output_filepath}")