def __init__(self):
    """Create the submitter from the 'biosamples' configuration section.

    Reads ``aap_url``, ``bsd_url``, ``username`` and ``password`` to build the
    communicator, then ``domain`` to build the submitter stored on
    ``self.submitter``.
    """
    communicator_settings = [
        cfg.query('biosamples', key)
        for key in ('aap_url', 'bsd_url', 'username', 'password')
    ]
    communicator = HALCommunicator(*communicator_settings)
    domain = cfg.query('biosamples', 'domain')
    self.submitter = BSDSubmitter(communicator, domain)
def check_submitted_variant_flanks(mongo_client, ssid):
    """Print a pairwise flank comparison for the records sharing one accession.

    Fetches every document of ``eva_accession_sharded.dbsnpSubmittedVariantEntity``
    whose ``accession`` equals ``int(ssid)`` and that has no ``remappedFrom``
    field, extracts 50 bases on each side of ``start`` with ``samtools faidx``,
    then compares ``flank_up + ref + flank_down`` for every pair of records and
    prints the formatted result.

    :param mongo_client: client indexed by database name, then collection name
    :param ssid: accession to look up (converted with ``int``)
    """
    samtools = cfg.query('executable', 'samtools', ret_default='samtools')
    sve_collection = mongo_client['eva_accession_sharded']['dbsnpSubmittedVariantEntity']
    flank_size = 50

    def extract_sequence(direction, fasta_path, region):
        # Run samtools on one region and return the upper-cased, stripped output.
        command = f"{samtools} faidx {fasta_path} {region} | grep -v '^>' | sed 's/\\n//' "
        raw_output = run_command_with_output(
            f'Extract {direction} sequence using {region}', command, return_process_output=True
        )
        return raw_output.strip().upper()

    def flanked_reference(info):
        # NOTE(review): the downstream flank starts at start + 1 whatever the
        # length of 'ref' -- confirm this is intended for multi-base references.
        return info['flank_up'] + info['variant_rec']['ref'] + info['flank_down']

    matching_records = list(
        sve_collection.find({'accession': int(ssid), 'remappedFrom': {'$exists': False}})
    )

    id_2_info = {}
    for record in matching_records:
        contig, start = record['contig'], record['start']
        upstream_region = f"{contig}:{start - flank_size}-{start - 1}"
        downstream_region = f"{contig}:{start + 1}-{start + flank_size}"
        fasta_path = get_genome(assembly_accession=record['seq'], taxonomy=record['tax'])
        upstream = extract_sequence('upstream', fasta_path, upstream_region)
        downstream = extract_sequence('downstream', fasta_path, downstream_region)
        id_2_info[record['_id']] = {
            'variant_rec': record,
            'flank_up': upstream,
            'flank_down': downstream,
        }

    for first_id, second_id in itertools.combinations(id_2_info, 2):
        first, second = id_2_info[first_id], id_2_info[second_id]
        alignment, strand = compare_variant_flanks(
            flanked_reference(first), flanked_reference(second)
        )
        print(format_output(
            ssid, first['variant_rec'], second['variant_rec'], alignment, strand,
            first['flank_up'], first['flank_down'],
            second['flank_up'], second['flank_down']
        ))
def upload_vcf_files_to_ena_ftp(self, files_to_upload):
    """Upload files over FTP into a remote directory named after ``self.eload``.

    Host, port (default 21), username and password are read from the 'ena'
    configuration section. The ``self.eload`` directory is created when it is
    not in the listing of the login directory. Each file is stored under its
    base name. The connection is closed when the method exits, whether the
    upload succeeded or raised.

    :param files_to_upload: iterable of local file paths to send
    """
    host = cfg.query('ena', 'ftphost')
    port = int(cfg.query('ena', 'ftpport', ret_default=21))
    self.info('Connect to %s', host)
    ftps = HackFTP_TLS()
    try:
        ftps.connect(host, port=port)
        ftps.login(cfg.query('ena', 'username'), cfg.query('ena', 'password'))
        ftps.prot_p()
        if self.eload not in ftps.nlst():
            self.info('Create %s directory', self.eload)
            ftps.mkd(self.eload)
        ftps.cwd(self.eload)
        for file_to_upload in files_to_upload:
            file_name = os.path.basename(file_to_upload)
            self.info('Upload %s to FTP', file_name)
            with open(file_to_upload, 'rb') as open_file:
                ftps.storbinary('STOR %s' % file_name, open_file)
    finally:
        # Release the socket even when a step above fails.
        # NOTE(review): assumes HackFTP_TLS exposes ftplib's close() -- confirm.
        ftps.close()
def upload_xml_files_to_ena(self, submission_file, project_file, analysis_file):
    """Post the submission XML files to the configured ENA URL and record the receipt.

    The raw response text is stored in ``self.results['receipt']`` and the
    output of ``self.parse_ena_receipt`` is merged into ``self.results``. Any
    entries in ``self.results['errors']`` are reported through ``self.error``.

    :param submission_file: path to the submission XML
    :param project_file: path to the project XML; may be ``None`` or empty, in
        which case no PROJECT part is sent
    :param analysis_file: path to the analysis XML
    """
    def as_xml_part(path):
        # (file name, content, content type) tuple for one multipart field.
        return os.path.basename(path), get_file_content(path), 'application/xml'

    # Insertion order is kept as SUBMISSION, PROJECT, ANALYSIS.
    files = {'SUBMISSION': as_xml_part(submission_file)}
    if project_file:
        # Without this guard a missing project file fails in os.path.basename(None).
        files['PROJECT'] = as_xml_part(project_file)
    files['ANALYSIS'] = as_xml_part(analysis_file)

    response = requests.post(
        cfg.query('ena', 'submit_url'),
        auth=HTTPBasicAuth(cfg.query('ena', 'username'), cfg.query('ena', 'password')),
        files=files
    )
    self.results['receipt'] = response.text
    self.results.update(self.parse_ena_receipt(response.text))
    if self.results['errors']:
        self.error('\n'.join(self.results['errors']))
def get_hold_date_from_ena(project_accession, project_alias=None):
    """Return the hold date of a project as reported by ENA.

    A RECEIPT request for the project alias is posted to the configured submit
    URL. When the receipt has a PROJECT element with a ``holdUntilDate``
    attribute, that value is parsed (with its UTC offset). Otherwise the
    project XML is downloaded from the ENA browser API and the value of the
    ``ENA-FIRST-PUBLIC`` attribute is parsed instead.

    :param project_accession: accession of the project
    :param project_alias: alias of the project; looked up with
        ``get_project_alias`` when not given
    :raises ValueError: when neither source provides a date
    """
    if not project_alias:
        project_alias = get_project_alias(project_accession)

    xml_request = (
        '<SUBMISSION_SET> <SUBMISSION> <ACTIONS> <ACTION> '
        f'<RECEIPT target="{project_alias}"/> '
        '</ACTION> </ACTIONS> </SUBMISSION> </SUBMISSION_SET>'
    )
    submit_url = cfg.query('ena', 'submit_url')
    credentials = HTTPBasicAuth(cfg.query('ena', 'username'), cfg.query('ena', 'password'))
    response = requests.post(submit_url, auth=credentials, files={'SUBMISSION': xml_request})
    receipt = ET.fromstring(response.text)

    hold_date = None
    try:
        raw_hold_date = receipt.findall('PROJECT')[0].attrib['holdUntilDate']
    except (IndexError, KeyError):
        # if there's no hold date, assume it's already been made public
        xml_root = download_xml_from_ena(
            f'https://www.ebi.ac.uk/ena/browser/api/xml/{project_accession}')
        attributes = xml_root.xpath(
            '/PROJECT_SET/PROJECT/PROJECT_ATTRIBUTES/PROJECT_ATTRIBUTE')
        first_public = next(
            (attr for attr in attributes
             if attr.findall('TAG')[0].text == 'ENA-FIRST-PUBLIC'),
            None
        )
        if first_public is not None:
            hold_date = datetime.strptime(first_public.findall('VALUE')[0].text, '%Y-%m-%d')
    else:
        # Drop the colon of the UTC offset so that %z can parse it.
        hold_date = datetime.strptime(raw_hold_date.replace(':', ''), '%Y-%m-%d%z')

    if hold_date is None:
        raise ValueError(
            f"Couldn't get hold date from ENA for {project_accession} ({project_alias})"
        )
    return hold_date
def get_genome_fasta_and_report(species_name, assembly_accession, output_directory=None, overwrite=False):
    """Return the FASTA and assembly report paths of an assembly, fetching them if needed.

    ``download_or_construct`` is called when either file is missing or when
    ``overwrite`` is set.

    :param species_name: species name passed to ``NCBIAssembly``
    :param assembly_accession: accession passed to ``NCBIAssembly``
    :param output_directory: target directory; defaults to the
        'genome_downloader' / 'output_directory' configuration value
    :param overwrite: force the download even when both files exist
    :return: tuple ``(assembly_fasta_path, assembly_report_path)``
    """
    if not output_directory:
        output_directory = cfg.query('genome_downloader', 'output_directory')
    assembly = NCBIAssembly(
        assembly_accession, species_name, output_directory,
        eutils_api_key=cfg['eutils_api_key']
    )
    already_available = (
        os.path.isfile(assembly.assembly_fasta_path)
        and os.path.isfile(assembly.assembly_report_path)
    )
    if not already_available or overwrite:
        assembly.download_or_construct(overwrite=overwrite)
    return assembly.assembly_fasta_path, assembly.assembly_report_path
def upload_xml_files_to_ena(self, submission_file, project_file, analysis_file):
    """Post the submission XML files to the configured ENA URL and record the receipt.

    SUBMISSION and ANALYSIS are always sent; PROJECT is added only when
    ``project_file`` is set. The response text is stored in
    ``self.results['receipt']``, the parsed receipt is merged into
    ``self.results``, and any parsed errors are reported via ``self.error``.
    """
    named_paths = [('SUBMISSION', submission_file), ('ANALYSIS', analysis_file)]
    # If we are uploading to an existing project the project_file is not set
    if project_file:
        named_paths.append(('PROJECT', project_file))
    payload = {
        field: (os.path.basename(path), get_file_content(path), 'application/xml')
        for field, path in named_paths
    }

    submit_url = cfg.query('ena', 'submit_url')
    credentials = HTTPBasicAuth(cfg.query('ena', 'username'), cfg.query('ena', 'password'))
    response = requests.post(submit_url, auth=credentials, files=payload)

    receipt_text = response.text
    self.results['receipt'] = receipt_text
    self.results.update(self.parse_ena_receipt(receipt_text))
    errors = self.results['errors']
    if errors:
        self.error('\n'.join(errors))
def get_reference_fasta_and_report(species_name, reference_accession, output_directory=None, overwrite=False):
    """Return the FASTA (and report, if any) paths for an assembly or a single sequence.

    :param species_name: species name passed to the NCBI helper classes
    :param reference_accession: assembly accession or GenBank sequence accession
    :param output_directory: target directory; defaults to the
        'genome_downloader' / 'output_directory' configuration value
    :param overwrite: force the download even when the files exist
    :return: ``(fasta_path, report_path)`` for an assembly accession,
        ``(fasta_path, None)`` for a GenBank sequence accession, and ``None``
        (after a warning) when the accession matches neither format
    """
    if not output_directory:
        output_directory = cfg.query('genome_downloader', 'output_directory')

    if NCBIAssembly.is_assembly_accession_format(reference_accession):
        assembly = NCBIAssembly(
            reference_accession, species_name, output_directory,
            eutils_api_key=cfg['eutils_api_key']
        )
        already_available = (
            os.path.isfile(assembly.assembly_fasta_path)
            and os.path.isfile(assembly.assembly_report_path)
        )
        if not already_available or overwrite:
            assembly.download_or_construct(overwrite=overwrite)
        return assembly.assembly_fasta_path, assembly.assembly_report_path

    if NCBISequence.is_genbank_accession_format(reference_accession):
        sequence = NCBISequence(
            reference_accession, species_name, output_directory,
            eutils_api_key=cfg['eutils_api_key']
        )
        fasta_exists = os.path.isfile(sequence.sequence_fasta_path)
        if not fasta_exists or overwrite:
            sequence.download_contig_sequence_from_ncbi(genbank_only=True)
        # A single sequence has no assembly report.
        return sequence.sequence_fasta_path, None

    logger.warning(f'{reference_accession} is not recognize as either an INSDC assembly or sequence.')
    # Callers receive None rather than a tuple in this case.
    return None
def get_genome(taxonomy, assembly_accession):
    """Build the path of an assembly FASTA below the configured genome directory.

    The path is ``<output_directory>/<scientific_name>/<accession>/<accession>.fa``
    where the scientific name is lower-cased with spaces replaced by
    underscores. The path is only composed; the file is not checked.

    :param taxonomy: value passed to ``get_scientific_name_from_taxonomy``
    :param assembly_accession: accession used as directory and file stem
    :return: the composed path
    """
    base_directory = cfg.query('genome_downloader', 'output_directory')
    scientific_name = get_scientific_name_from_taxonomy(taxonomy)
    species_directory = scientific_name.lower().replace(' ', '_')
    fasta_name = assembly_accession + '.fa'
    return os.path.join(base_directory, species_directory, assembly_accession, fasta_name)