Example #1
    @classmethod
    def tearDownClass(cls):
        # delete created objects
        Breed.objects().delete()
        Counter.objects().delete()
        Dataset.objects().delete()

        super().tearDownClass()
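For context, a matching setUpClass() would create the objects that this tearDownClass() removes. The following is only a sketch: the fixture values are placeholders, not the project's real test data.

    @classmethod
    def setUpClass(cls):
        super().setUpClass()

        # hypothetical fixtures: one dataset and one breed (Counter objects,
        # also cleaned up above, are not shown here)
        cls.dataset = Dataset(file="test.zip", species="Sheep").save()
        cls.breed = Breed(species="Sheep", name="Texel", code="TEX").save()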
def main(species, assembly):
    logger.info(f"{Path(__file__).name} started")

    # find assembly configuration
    if assembly not in WORKING_ASSEMBLIES:
        raise Exception(f"assembly {assembly} not managed by smarter")

    # open a file to track files to merge
    smarter_tag = "SMARTER-{specie}-{assembly}-top-{version}".format(
        specie=SPECIES2CODE[species.capitalize()],
        assembly=assembly.upper(),
        version=__version__)
    merge_file = get_interim_dir() / smarter_tag

    with merge_file.open(mode="w") as handle:
        for dataset in Dataset.objects(species=species.capitalize()):
            logger.debug(f"Got {dataset}")

            # search for result dir
            results_dir = Path(dataset.result_dir) / assembly.upper()

            if results_dir.exists():
                logger.info(f"Found {results_dir}")

                # search for bed files
                bed_files = results_dir.glob('*.bed')

                # there can be more than one bed file per dataset (when the
                # dataset includes more than one genotype file)
                for bed_file in bed_files:
                    # determine the bedfile full path
                    prefix = results_dir / bed_file.stem

                    logger.info(f"Appeding {prefix} for merge")

                    # track file to merge
                    handle.write(f"{prefix}\n")

    # ok check for results dir
    final_dir = get_processed_dir() / assembly
    final_dir.mkdir(parents=True, exist_ok=True)

    # ok time to convert data into plink binary format
    cmd = ["plink"] + PLINK_SPECIES_OPT[species.capitalize()] + [
        "--merge-list", f"{merge_file}", "--make-bed", "--out",
        f"{final_dir / smarter_tag}"
    ]

    # debug
    logger.info("Executing: " + " ".join(cmd))

    subprocess.run(cmd, check=True)

    logger.info(f"{Path(__file__).name} ended")
Example #3
def fetch_and_check_dataset(archive: str,
                            contents: list[str]) -> tuple[Dataset, list[Path]]:
    """Common operations on dataset: fetch a dataset by file (submitted
    archive), check that working dir exists and required file contents is
    in dataset. Test and get full path of required files

    Args:
        archive (str): the dataset archive (file)
        contents (list): a list of files which need to be in the dataset

    Returns:
        Dataset: a dataset instance
        list[Path]: a list of Path of required files
    """

    # get the dataset object
    dataset = Dataset.objects(file=archive).get()

    logger.debug(f"Found {dataset}")

    # check for working directory
    working_dir = dataset.working_dir

    if not working_dir.exists():
        raise FileNotFoundError(
            f"Could find dataset directory '{working_dir}'")

    # check files are in dataset
    not_found = []
    contents_path = []

    for item in contents:
        if item not in dataset.contents:
            logger.error(f"Couldn't find '{item}' in dataset: '{dataset}'")
            not_found.append(item)
            continue

        item_path = working_dir / item

        if not item_path.exists():
            logger.error(
                f"Couldn't find '{item_path}' in dir: '{working_dir}'")
            not_found.append(item)
            continue

        contents_path.append(item_path)

    if len(not_found) > 0:
        raise FileNotFoundError(f"Couldn't find '{not_found}'")

    return dataset, contents_path
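A typical call could look like the following; the archive name comes from the test examples below, while its listed contents are an assumption:

dataset, (mapfile, pedfile) = fetch_and_check_dataset(
    archive="test.zip",
    contents=["plinktest.map", "plinktest.ped"])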
Example #4
    def test_update_pedfile(self):
        # get a dataset
        dataset = Dataset.objects(file="test.zip").get()

        # create a temporary directory using the context manager
        with tempfile.TemporaryDirectory() as tmpdirname:
            outfile = pathlib.Path(tmpdirname) / "plinktest_updated.ped"
            self.plinkio.update_pedfile(str(outfile), dataset, 'ab', fid="TEX")

            # now open outputfile and test stuff
            test = TextPlinkIO(mapfile=str(DATA_DIR / "plinktest.map"),
                               pedfile=str(outfile))

            # assert two records written
            self.assertEqual(len(list(test.read_pedfile())), 2)
Example #5
    def test_process_pedline(self):
        # define reference
        reference = [
            'TEX', 'ITOA-TEX-000000001', '0', '0', '0', -9, 'A', 'A', 'G', 'G'
        ]

        # get a line for testing
        line = self.lines[0]

        # get a dataset
        dataset = Dataset.objects(file="test.zip").get()

        test = self.plinkio._process_pedline(line, dataset, 'ab')

        self.assertEqual(reference, test)
Example #6
    def setUp(self):
        super().setUp()

        self.plinkio = TextPlinkIO(prefix=str(DATA_DIR / "plinktest"),
                                   species="Sheep")

        # read info from map
        self.plinkio.read_mapfile()
        self.plinkio.fetch_coordinates(version="Oar_v3.1",
                                       imported_from="SNPchiMp v.3")

        # read first line of ped file
        self.lines = list(self.plinkio.read_pedfile())

        # get a dataset
        self.dataset = Dataset.objects(file="test.zip").get()
def main(species, name, code, alias, dataset):
    """Add or update a breed into SMARTER database"""

    logger.info(f"{Path(__file__).name} started")

    # get the dataset object
    dataset = Dataset.objects(file=dataset).get()

    # fix input parameters
    aliases = [BreedAlias(fid=fid, dataset=dataset) for fid in alias]
    species = species.capitalize()
    code = code.upper()

    # get a breed object relying on parameters
    breed, modified = get_or_create_breed(
        species=species, name=name, code=code, aliases=aliases)

    if modified:
        logger.info(f"{breed} added to database")

    logger.info(f"{Path(__file__).name} ended")
def main(file_, bfile, dataset, coding, chip_name, assembly):
    """Read sample names from map/ped files and updata smarter database (insert
    a record if necessary and define a smarter id for each sample)
    """

    logger.info(f"{Path(__file__).name} started")

    # find assembly configuration
    if assembly not in WORKING_ASSEMBLIES:
        raise Exception(f"assembly {assembly} not managed by smarter")

    assembly_conf = WORKING_ASSEMBLIES[assembly]

    # get the dataset object
    dataset = Dataset.objects(file=dataset).get()

    logger.debug(f"Found {dataset}")

    if file_:
        plinkio, output_dir, output_map, output_ped = deal_with_text_plink(
            file_, dataset, assembly)

    elif bfile:
        plinkio, output_dir, output_map, output_ped = deal_with_binary_plink(
            bfile, dataset, assembly)

    else:
        raise Exception("Please provide a plink text or binary file")

    # check chip_name
    illumina_chip = IlluminaChip.objects(name=chip_name).get()

    # set chip name for this sample
    plinkio.chip_name = illumina_chip.name

    # test if I have already run this analysis

    # ok check for results dir
    results_dir = dataset.result_dir
    results_dir = results_dir / assembly
    results_dir.mkdir(parents=True, exist_ok=True)

    # define final filename
    final_prefix = results_dir / output_ped.stem

    # test for processed files existence
    if plink_binary_exists(final_prefix):
        logger.warning(f"Skipping {dataset} processing: {final_prefix} exists")
        logger.info(f"{Path(__file__).name} ended")
        return

    # if I arrive here, I can create output files

    # read mapdata and read updated coordinates from db
    plinkio.read_mapfile()

    # fetch coordinates relying on the assembly configuration
    plinkio.fetch_coordinates(version=assembly_conf.version,
                              imported_from=assembly_conf.imported_from)

    logger.info("Writing a new map file with updated coordinates")
    plinkio.update_mapfile(str(output_map))

    logger.info("Writing a new ped file with fixed genotype")
    plinkio.update_pedfile(output_ped, dataset, coding)

    # ok time to convert data into plink binary format
    cmd = ["plink"] + PLINK_SPECIES_OPT[dataset.species] + [
        "--file", f"{output_dir / output_ped.stem}", "--make-bed", "--out",
        f"{final_prefix}"
    ]

    # debug
    logger.info("Executing: " + " ".join(cmd))

    subprocess.run(cmd, check=True)

    logger.info(f"{Path(__file__).name} ended")
Example #9
def main(input_filepath, output_filepath, types):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """

    logger = logging.getLogger(__name__)

    # connect to database
    global_connection()

    with open(input_filepath) as handle:
        reader = csv.reader(handle, delimiter=";")

        header = next(reader)

        # remove header id
        del header[0]

        # sanitize column
        header = [sanitize(col) for col in header]

        logger.info("Got %s as header" % header)

        # define a datatype for my data
        Record = collections.namedtuple("Record", header)

        for line in reader:
            # remove id from record
            del line[0]

            # remove empty string
            line = [col if col != '' else None for col in line]

            record = Record._make(line)
            logger.debug(record)

            # search for the archive file
            archive = next(project_dir.rglob(record.file))
            logger.info(f"Found {archive} as archive")

            archive = zipfile.ZipFile(archive)

            logger.debug("Get file contents")
            contents = archive.namelist()
            logger.debug(contents)

            # insert or update with a mongodb method
            dataset = Dataset.objects(
                file=record.file).upsert_one(**record._asdict(),
                                             type_=types,
                                             contents=contents)

            # ok extract content to working directory
            # TODO: avoid working with extracted plain-text files; try to
            # work with the compressed data directly
            working_dir = project_dir / f"data/interim/{dataset.id}"
            working_dir.mkdir(exist_ok=True)

            for member in contents:
                test = working_dir / member
                if not test.exists():
                    logger.info(f"Extract '{member}': in '{working_dir}'")
                    archive.extract(member, working_dir)

                else:
                    logger.debug(f"Skipping {member}: already extracted")

    with open(output_filepath, "w") as handle:
        # after the inserts, dump all dataset records to the output file
        handle.write(Dataset.objects.to_json(indent=2))

    logger.info(f"Data written into database and in {output_filepath}")