Beispiel #1
0
def process_folder(result_folder, qai_server, qai_user, qai_password,
                   pipeline_version):
    """ Read the result files in a folder and upload them through QAI.

    The folder must hold conseq.csv, remap_counts.csv, cascade.csv, and
    coverage_scores.csv. SampleSheet.csv is read from two directory levels
    above the result folder.

    @param result_folder: path to the folder of collated result files
    @param qai_server: passed to session.login()
    @param qai_user: passed to session.login()
    @param qai_password: passed to session.login()
    @param pipeline_version: passed through to upload_review_to_qai()
    """
    logger.info('Uploading data to Oracle from {}'.format(result_folder))

    def result_path(file_name):
        return os.path.join(result_folder, file_name)

    conseqs_path = result_path('conseq.csv')
    counts_path = result_path('remap_counts.csv')
    cascade_path = result_path('cascade.csv')
    coverage_path = result_path('coverage_scores.csv')

    # Normalize first, so a trailing separator doesn't count as a level.
    results_parent = os.path.dirname(os.path.normpath(result_folder))
    run_path = os.path.dirname(results_parent)
    with open(os.path.join(run_path, "SampleSheet.csv"), "rU") as sheet_file:
        sample_sheet = sample_sheet_parser.sample_sheet_parser(sheet_file)

    ok_sample_regions = load_ok_sample_regions(result_folder)

    with qai_helper.Session() as session:
        session.login(qai_server, qai_user, qai_password)

        run = find_run(session, sample_sheet["Experiment Name"])

        with open(conseqs_path, "rU") as conseqs_file:
            conseqs = build_conseqs(conseqs_file,
                                    run,
                                    sample_sheet,
                                    ok_sample_regions)
        with open(coverage_path, "rU") as coverage_file:
            with open(counts_path, "rU") as counts_file:
                with open(cascade_path, "rU") as cascade_file:
                    upload_review_to_qai(coverage_file,
                                         counts_file,
                                         cascade_file,
                                         run,
                                         sample_sheet,
                                         conseqs,
                                         session,
                                         pipeline_version)
Beispiel #2
0
    def download_quality(self, folder):
        """ Fetch the run's error metrics from QAI and write them to a file.

        The file goes under settings.home, in a folder named after the base
        name of the run folder. That folder is created when it is missing.

        @param folder: path to the run folder that holds RunInfo.xml
        @return path for the quality CSV file
        @raise RuntimeError: when QAI returns no metrics for the run
        """
        run_name = self.trim_folder(folder)
        target_folder = os.path.join(settings.home, os.path.basename(folder))
        quality_path = os.path.join(target_folder,
                                    '{}_quality.csv'.format(run_name))
        if not os.path.exists(target_folder):
            os.makedirs(target_folder)

        run_info = self.parse_run_info(os.path.join(folder, 'RunInfo.xml'))
        with qai_helper.Session() as session:
            session.login(settings.qai_path,
                          settings.qai_user,
                          settings.qai_password)
            run_id = run_info.miseq_run_id
            metrics = session.get_json('/miseqqc_errormetrics?runid=' + run_id)
            if not metrics:
                raise RuntimeError(
                    'No quality control metrics found for run ' + run_id)

        # The session is closed before the metrics are written out.
        with open(quality_path, 'w') as quality_file:
            self.write_quality(quality_file, metrics, run_info)
        return quality_path
Beispiel #3
0
def main():
    """ Fetch region and project definitions from QAI and dump them as JSON.

    Two dumps are passed to dump_json(): "../projects.json" without the
    scoring fields (key positions and minimum coverage levels), and
    "../project_scoring.json" with the scoring fields plus the length of
    each coordinate region's reference, but without the regions section.

    Projects without any regions are dropped, and only the regions that are
    used as a coordinate region or seed region are kept.

    @raise RuntimeError: when the projects response has an 'errors' entry
    """
    args = parse_args()
    dump = {}
    used_regions = set()
    with qai_helper.Session() as session:
        session.login(args.qai_server, args.qai_user, args.qai_password)

        dump['regions'] = session.get_json("/lab_miseq_regions?mode=dump",
                                           retries=0)
        dump['projects'] = session.get_json(
            "/lab_miseq_projects?mode=dump&pipeline=" + args.pipeline_version,
            retries=0)

        # Report errors before touching the projects: the 'errors' entry is
        # joined as a sequence of messages, so treating it as a project in
        # the loop below would fail with a TypeError and hide the messages.
        errors = dump['projects'].get('errors')
        if errors:
            raise RuntimeError('\n'.join(errors))

        empty_projects = []
        for name, project in dump['projects'].items():
            project['regions'].sort(key=itemgetter('coordinate_region'))
            for region in project['regions']:
                used_regions.add(region['coordinate_region'])
                used_regions.update(region['seed_region_names'])
            if not project['regions']:
                empty_projects.append(name)
        # Delete after the loop, because the dict can't change size while
        # it is being iterated.
        for name in empty_projects:
            del dump['projects'][name]
        check_key_positions(dump['projects'], sys.stdout)
    dump['regions'] = {
        key: value
        for key, value in dump['regions'].items() if key in used_regions
    }

    # Copy before stripping the scoring fields, so the copy keeps them.
    dump_scoring = deepcopy(dump)
    for project in dump['projects'].values():
        for region in project['regions']:
            del region['key_positions']
            del region['min_coverage1']
            del region['min_coverage2']
            del region['min_coverage3']

    dump_json(dump, "../projects.json")

    for project in dump_scoring['projects'].values():
        for region in project['regions']:
            name = region['coordinate_region']
            seq = ''.join(dump_scoring['regions'][name]['reference'])
            region['coordinate_region_length'] = len(seq)
    del dump_scoring['regions']
    dump_json(dump_scoring, "../project_scoring.json")

    print("Done.")
Beispiel #4
0
def upload_loop(qai_server, qai_user, qai_password, pipeline_version,
                upload_queue):
    """ Process every item from the upload queue until a None arrives.

    A trial login is attempted first. Its failure is only logged, and the
    loop still starts.

    @param qai_server: passed to session.login() and process_folder()
    @param qai_user: passed to session.login() and process_folder()
    @param qai_password: passed to session.login() and process_folder()
    @param pipeline_version: passed to process_folder()
    @param upload_queue: object whose get() returns the next item to pass
        to process_folder(), or None to stop
    """
    # noinspection PyBroadException
    try:
        with qai_helper.Session() as session:
            # The login is only a check, so problems are reported at launch.
            session.login(qai_server, qai_user, qai_password)
    except Exception:
        logger.error('Unable to log in to QAI.', exc_info=True)

    item = upload_queue.get()
    while item is not None:
        process_folder(item,
                       qai_server,
                       qai_user,
                       qai_password,
                       pipeline_version)
        item = upload_queue.get()
Beispiel #5
0
def main():
    dump = {}
    with qai_helper.Session() as session:
        session.login(settings.qai_project_path, settings.qai_project_user,
                      settings.qai_project_password)

        dump['regions'] = session.get_json("/lab_miseq_regions?mode=dump",
                                           retries=0)
        dump['projects'] = session.get_json(
            "/lab_miseq_projects?mode=dump&pipeline=" +
            settings.pipeline_version,
            retries=0)
        for project in dump['projects'].itervalues():
            project['regions'].sort()
        errors = dump['projects'].get('errors')
        if errors:
            raise StandardError('\n'.join(errors))
        check_key_positions(dump['projects'], sys.stdout)

    dump_scoring = deepcopy(dump)
    for project in dump['projects'].itervalues():
        for region in project['regions']:
            del region['key_positions']
            del region['min_coverage1']
            del region['min_coverage2']
            del region['min_coverage3']

    dump_json(dump, "../projects.json")

    for project in dump_scoring['projects'].itervalues():
        for region in project['regions']:
            name = region['coordinate_region']
            seq = ''.join(dump_scoring['regions'][name]['reference'])
            region['coordinate_region_length'] = len(seq)
    del dump_scoring['regions']
    dump_json(dump_scoring, "../project_scoring.json")

    print "Done."
Beispiel #6
0
def main():
    project_config = ProjectConfig.loadDefault()
    with open('../project_scoring.json', 'rU') as scoring_file:
        scoring_config = json.load(scoring_file)
    with qai_helper.Session() as session:
        session.login(settings.qai_path, settings.qai_user,
                      settings.qai_password)

        pipelines = session.get_json("/lab_miseq_pipelines?version=" +
                                     settings.pipeline_version,
                                     retries=0)
        if pipelines:
            raise RuntimeError('Pipeline {} already exists.'.format(
                settings.pipeline_version))

        seed_groups = session.get_json("/lab_miseq_seed_groups")
        seed_group_ids = dict(map(itemgetter('name', 'id'), seed_groups))
        old_regions = session.get_json("/lab_miseq_regions", retries=0)
        regions = dict(((region['name'], region) for region in old_regions))
        for region_name, region_data in project_config.config[
                'regions'].iteritems():
            region = regions.get(region_name)
            if region is None:
                seed_group_name = region_data['seed_group']
                seed_group_id = seed_group_ids.get(seed_group_name)
                if seed_group_id is None and seed_group_name:
                    seed_group = session.post_json("/lab_miseq_seed_groups",
                                                   {'name': seed_group_name})
                    seed_group_id = seed_group['id']
                    seed_group_ids[seed_group_name] = seed_group_id
                region = session.post_json(
                    "/lab_miseq_regions", {
                        'name': region_name,
                        'is_nucleotide': region_data['is_nucleotide'],
                        'reference': ''.join(region_data['reference']),
                        'seed_group_id': seed_group_id
                    })
                regions[region_name] = region

        pipeline = session.post_json("/lab_miseq_pipelines",
                                     {'version': settings.pipeline_version})
        pipeline_id = pipeline['id']

        old_projects = session.get_json("/lab_miseq_projects", retries=0)
        projects = dict(
            ((project['name'], project) for project in old_projects))
        for project_name, project_data in project_config.config[
                'projects'].iteritems():
            project = projects.get(project_name)
            if project is None:
                project = session.post_json(
                    "/lab_miseq_projects", {
                        'name': project_name,
                        'max_variants': project_data['max_variants']
                    })
            project_version = session.post_json("/lab_miseq_project_versions",
                                                {
                                                    'pipeline_id': pipeline_id,
                                                    'project_id': project['id']
                                                })
            for i, region_data in enumerate(project_data['regions']):
                scoring_data = scoring_config['projects'][project_name][
                    'regions'][i]
                coordinate_region = regions[region_data['coordinate_region']]
                seed_region = regions[region_data['seed_region_names'][0]]
                seed_group_id = seed_region['seed_group_id']
                project_region = session.post_json(
                    "/lab_miseq_project_regions", {
                        'project_version_id': project_version['id'],
                        'coordinate_region_id': coordinate_region['id'],
                        'min_coverage1': scoring_data['min_coverage1'],
                        'min_coverage2': scoring_data['min_coverage2'],
                        'min_coverage3': scoring_data['min_coverage3'],
                        'seed_group_id': seed_group_id
                    })

                for key_position in scoring_data['key_positions']:
                    session.post_json(
                        "/lab_miseq_key_positions", {
                            'project_region_id': project_region['id'],
                            'start_pos': key_position['start_pos'],
                            'end_pos': key_position['end_pos']
                        })

    print "Done."
Beispiel #7
0
def main():
    """ Register a new pipeline version and its project definitions in QAI.

    Any seed groups, regions, and projects in the project configuration that
    QAI doesn't have yet are created. An existing region with a different
    reference is reported, and is only updated when args.update_sequences is
    set. Then the pipeline is created, along with a project version for
    every configured project, and a project region with its key positions
    for every region of those projects. The coverage levels and key
    positions come from project_scoring.json, two folders above this file.

    @raise RuntimeError: when QAI already has the pipeline version
    """
    args = parse_args()
    project_config = ProjectConfig.loadDefault()
    scoring_path = Path(__file__).parent.parent / 'project_scoring.json'
    with scoring_path.open() as scoring_file:
        scoring_config = json.load(scoring_file)
    with qai_helper.Session() as session:
        session.login(args.qai_server, args.qai_user, args.qai_password)

        existing_pipelines = session.get_json(
            "/lab_miseq_pipelines?version=" + args.pipeline_version,
            retries=0)
        if existing_pipelines:
            raise RuntimeError('Pipeline {} already exists.'.format(
                args.pipeline_version))

        seed_group_ids = {
            group['name']: group['id']
            for group in session.get_json("/lab_miseq_seed_groups")
        }
        regions = {
            old_region['name']: old_region
            for old_region in session.get_json("/lab_miseq_regions",
                                               retries=0)
        }
        configured_regions = project_config.config['regions']
        for region_name, region_data in configured_regions.items():
            ref_seq = ''.join(region_data['reference'])
            region = regions.get(region_name)
            if region is not None:
                # QAI already has this region, so only compare sequences.
                if region['reference'] != ref_seq:
                    print("Reference doesn't match:", region_name)
                    if args.update_sequences:
                        region['reference'] = ref_seq
                        session.post_json(
                            f"/lab_miseq_regions/{region['id']}",
                            region)
                continue
            seed_group_name = region_data['seed_group']
            seed_group_id = seed_group_ids.get(seed_group_name)
            if seed_group_name and seed_group_id is None:
                new_group = session.post_json("/lab_miseq_seed_groups",
                                              {'name': seed_group_name})
                seed_group_id = new_group['id']
                seed_group_ids[seed_group_name] = seed_group_id
            region_fields = {
                'name': region_name,
                'is_nucleotide': region_data['is_nucleotide'],
                'reference': ref_seq,
                'seed_group_id': seed_group_id
            }
            regions[region_name] = session.post_json("/lab_miseq_regions",
                                                     region_fields)

        pipeline = session.post_json("/lab_miseq_pipelines",
                                     {'version': args.pipeline_version})
        pipeline_id = pipeline['id']

        projects = {
            old_project['name']: old_project
            for old_project in session.get_json("/lab_miseq_projects",
                                                retries=0)
        }
        configured_projects = project_config.config['projects']
        for project_name, project_data in configured_projects.items():
            project = projects.get(project_name)
            if project is None:
                project_fields = {
                    'name': project_name,
                    'max_variants': project_data['max_variants']
                }
                project = session.post_json("/lab_miseq_projects",
                                            project_fields)
            version_fields = {
                'pipeline_id': pipeline_id,
                'project_id': project['id']
            }
            project_version = session.post_json(
                "/lab_miseq_project_versions",
                version_fields)
            for position, region_data in enumerate(project_data['regions']):
                # Scoring entries are matched to regions by list position.
                project_scoring = scoring_config['projects'][project_name]
                scoring_data = project_scoring['regions'][position]
                coordinate_region = regions[region_data['coordinate_region']]
                # Only the first seed region decides the seed group.
                first_seed_name = region_data['seed_region_names'][0]
                seed_group_id = regions[first_seed_name]['seed_group_id']
                project_region_fields = {
                    'project_version_id': project_version['id'],
                    'coordinate_region_id': coordinate_region['id'],
                    'min_coverage1': scoring_data['min_coverage1'],
                    'min_coverage2': scoring_data['min_coverage2'],
                    'min_coverage3': scoring_data['min_coverage3'],
                    'seed_group_id': seed_group_id
                }
                project_region = session.post_json(
                    "/lab_miseq_project_regions",
                    project_region_fields)

                for key_position in scoring_data['key_positions']:
                    key_position_fields = {
                        'project_region_id': project_region['id'],
                        'start_pos': key_position['start_pos'],
                        'end_pos': key_position['end_pos']
                    }
                    session.post_json("/lab_miseq_key_positions",
                                      key_position_fields)

    print("Done.")
def main():
    """ Synchronize the HIV1-seed regions in QAI with a LANL alignment file.

    The alignment is requested from the LANL web site, unless the CSV file
    is already in the current folder. Each row that isn't skipped as a
    recombinant (subtype starting with a digit) is compared to the region of
    the same name in the HIV1-seed group: a missing region is created,
    unless its name is over 30 characters long. Regions in the seed group
    that have no row in the file are deleted.

    @raise RuntimeError: when the response holds no alignment, the seed
        group is missing, or an existing seed has a different sequence
    """
    filename = 'HIV1_COM_2015_genome_DNA.csv'
    args = parse_args()

    if not os.path.exists(filename):
        form = {
            'ORGANISM': 'HIV',
            'ALIGN_TYPE': 'COM',
            'SUBORGANISM': 'HIV1',
            'PRE_USER': '******',
            'REGION': 'GENOME',
            'START': '',
            'END': '',
            'GENO_SUB': 'All',
            'BASETYPE': 'DNA',
            'YEAR': '2015',
            'FORMAT': 'csv',
            'submit': 'Get Alignment'
        }
        response = requests.post(
            "https://www.hiv.lanl.gov/cgi-bin/NEWALIGN/align.cgi", data=form)

        response.raise_for_status()

        match = re.search(r'<pre>(.*)</pre>', response.text, re.DOTALL)
        # Check before opening the file: an empty file left behind would be
        # read as an alignment with no rows on the next run, and then every
        # existing seed would be deleted as a left over.
        if match is None:
            raise RuntimeError(
                'No alignment found in the response from LANL.')
        with open(filename, 'w') as f:
            f.write(match.group(1))

    with qai_helper.Session() as session:
        session.login(args.qai_server, args.qai_user, args.qai_password)

        seed_groups = session.get_json("/lab_miseq_seed_groups")
        seed_group_name = 'HIV1-seed'
        # After the loop, seed_group is the matching group.
        for seed_group in seed_groups:
            if seed_group['name'] == seed_group_name:
                break
        else:
            raise RuntimeError(
                'Seed group {} not found.'.format(seed_group_name))
        old_regions = session.get_json("/lab_miseq_regions", retries=0)
        hiv_seeds = {
            region['name']: region
            for region in old_regions
            if region['seed_group_id'] == seed_group['id']
        }
        del old_regions

        clean_count = 0
        dirty_count = 0
        recombinant_names = []
        # Plain 'r' mode, because Python 3.11 and later reject the 'U' flag.
        with open(filename, 'r') as f:
            reader = csv.reader(f)
            for description, seed_seq in reader:
                seed_seq = seed_seq.replace('-', '')
                name_fields = description.split('.')
                subtype, country = name_fields[:2]
                accession = name_fields[-1]
                if subtype[0].isdigit():
                    recombinant_names.append(description)
                    continue
                seed_name = '-'.join(
                    ('HIV1', subtype, country, accession, 'seed'))

                groups = re.findall(r'([^ACGT]+)', seed_seq)
                if groups:
                    dirty_count += 1
                    print('Unexpected bases found in {}: {}'.format(
                        seed_name, ', '.join(groups)))
                else:
                    clean_count += 1
                # Pop the seeds that are found, so only left overs remain.
                old_region = hiv_seeds.pop(seed_name, None)
                if old_region:
                    old_seq = ''.join(old_region['reference'])
                    if old_seq != seed_seq:
                        print('expected: ' + seed_seq)
                        print('found:    ' + old_seq)
                        raise RuntimeError(
                            'Seed sequence {} does not match.'.format(
                                seed_name))
                elif len(seed_name) > 30:
                    print('Name too long: {!r}.'.format(seed_name))
                else:
                    session.post_json(
                        "/lab_miseq_regions", {
                            'name': seed_name,
                            'description': description,
                            'is_nucleotide': True,
                            'reference': seed_seq,
                            'seed_group_id': seed_group['id']
                        })
        if recombinant_names:
            print('Skipped recombinants: ' +
                  ', '.join(sorted(recombinant_names)))
        if hiv_seeds:
            seed_names = sorted(hiv_seeds.keys())
            # Change to False to only list the left over seeds.
            should_delete = True
            print('Left over seeds:')
            if not should_delete:
                print(', '.join(seed_names))
            else:
                for seed_name in seed_names:
                    print(seed_name)
                    seed_id = hiv_seeds[seed_name]['id']
                    # NOTE(review): unlike the other requests, this one
                    # includes the server in the path - confirm that
                    # session.delete() expects a full URL.
                    session.delete('{}/lab_miseq_regions/{}'.format(
                        args.qai_server, seed_id))

        print('Done with {} clean and {} dirty.'.format(
            clean_count, dirty_count))