def all_remote_versions(self):
    """
    Collect the versions found in the release catalogue archive and append
    the latest version.

    :return: list of variables of the type DataSourceVersion
    """
    # the release catalogue files are listed from the 'archive' directory
    archive_url = posixpath.join(self.BASEURL, self.BASEPATH, 'archive')

    # keep every distinct release catalogue file name once
    catalog_names = {
        entry.name
        for entry in downloader.list_ftp_dir(archive_url)
        if 'RefSeq-release' in entry.name
    }

    # extract the release number from each name
    # RefSeq-release20.catalog.gz => 20
    release_numbers = []
    for catalog_name in catalog_names:
        release_token = catalog_name.split('-')[1].split('.')[0]
        release_numbers.append(int(release_token.replace('release', '')))

    # wrap the numbers in DataSourceVersion and add the latest version last
    remote_versions = list(map(DataSourceVersion, release_numbers))
    remote_versions.append(self.latest_remote_version())
    return remote_versions
def latest_remote_version(self):
    """
    Determine the latest version offered by the remote server.

    Versions are named with 'year_month', e.g. '2016_01'. The name is taken
    from the 'version' element of the 'RELEASE.metalink' file located in the
    current release directory.

    :return: Latest remote version.
    :rtype: DataSourceVersion
    """
    metalink_url = posixpath.join(
        self.UNIPROT_BASEURL,
        self.UNIPROT_CURRENT_BASEPATH,
        'RELEASE.metalink',
    )

    # fetch the XML file from the server and turn its content into a string
    remote_file = downloader.get_single_file_ftp(metalink_url)
    xml_text = remote_file.read().decode()

    # rename the default namespace declaration so that elements can be
    # looked up by their plain tag names
    xml_text = xml_text.replace(' xmlns="', ' xmlnamespace="')

    root = ElementTree.fromstring(xml_text)
    version_name = root.find('version').text
    return DataSourceVersion(self._date_from_name(version_name))
def latest_remote_version(self):
    """
    Read the number of the latest release from
    ftp://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER.

    :return: DataSourceVersion of latest remote version
    """
    release_number_url = 'ftp://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER'

    # the downloader hands back a file-like object holding the raw bytes
    # (presumably a BytesIO); read and decode it to get the text
    remote_file = downloader.get_single_file_ftp(release_number_url)
    release_text = remote_file.read().decode()

    # drop the trailing whitespace before converting to a number
    latest_release = int(release_text.rstrip())
    return DataSourceVersion(latest_release)
def all_remote_versions(self):
    """
    List all versions known for this data source.

    One DataSourceVersion is created for every item obtained by iterating
    VERSION_2_URL; no remote directory listing is performed here.

    :return: List of DataSourceVersions.
    :rtype: list(DataSourceVersion)
    """
    return list(map(DataSourceVersion, VERSION_2_URL))
def all_remote_versions(self):
    """
    Get all versions available on miRBase FTP server.

    Every entry returned by the listing of the miRBase base path is turned
    into a version, using the entry name as version name.

    :return: List of DataSourceVersions available on server.
    :rtype: list(DataSourceVersion)
    """
    remote_entries = downloader.list_ftp_dir(
        self.mirbase_baseurl, path=self.mirbase_basepath)

    remote_versions = []
    for entry in remote_entries:
        remote_versions.append(DataSourceVersion(entry.name))
    return remote_versions
def get_catalog_file_path(instance):
    """
    Return the path to the catalog file for a given instance.

    The unfiltered catalog file is returned when it exists in the instance
    directory. Otherwise the path of the filtered catalog file is returned,
    without checking that this file exists.

    :param instance: The DataSource instance
    :return: The catalog file path
    """
    release = DataSourceVersion.version_from_string(instance.version)

    # try to return unfiltered
    unfiltered_path = os.path.join(
        instance.instance_dir,
        'RefSeq-release{0}.catalog.gz'.format(release),
    )
    if os.path.exists(unfiltered_path):
        return unfiltered_path

    # fall back to the filtered file
    return os.path.join(
        instance.instance_dir,
        'RefSeq-release{0}.catalog.filtered.gz'.format(release),
    )
def get_accession2geneid_file_path(instance):
    """
    Return the path to the accession2geneid file for a given instance.

    The unfiltered file is returned when it exists in the instance
    directory. Otherwise the path of the filtered file is returned, without
    checking that this file exists.

    :param instance: The DataSource instance
    :return: The accession2geneid file path
    """
    release = DataSourceVersion.version_from_string(instance.version)

    # try to return unfiltered
    unfiltered_path = os.path.join(
        instance.instance_dir,
        'release{0}.accession2geneid.gz'.format(release),
    )
    if os.path.exists(unfiltered_path):
        return unfiltered_path

    # fall back to the filtered file
    return os.path.join(
        instance.instance_dir,
        'release{0}.accession2geneid.filtered.gz'.format(release),
    )
def latest_remote_version(self):
    """
    Return the latest version, which is a fixed value.

    Only the latest version is accessible.
    """
    latest_version_name = '03-2018'
    return DataSourceVersion(latest_version_name)
def parse_xml(self):
    """
    Parse descriptor XML file.

    The file 'desc<version>.xml' is taken from the 'Mesh' instance. For every
    child element of the XML root (one descriptor record each) the following
    is added:

    - a node in ``self.descriptor``
    - a node in ``self.qualifier`` for every allowable qualifier not seen
      before, plus a ``self.descriptor_allowed_qualifier`` relationship
    - a node in ``self.concept`` for every concept not seen before, plus a
      ``self.descriptor_has_concept`` relationship
    - a ``self.concept_related_concept`` relationship for every concept
      relation
    - a node in ``self.term`` for every term not seen before, plus a
      ``self.concept_has_term`` relationship

    Nodes are keyed by their UI ('sid'); a node is only added the first time
    its UI is encountered, relationships are added on every occurrence.
    """
    mesh_instance = self.get_instance_by_name('Mesh')
    version = DataSourceVersion.version_from_string(mesh_instance.version)

    descriptor_xml = mesh_instance.get_file('desc{}.xml'.format(str(version)))
    log.debug("XML file {}".format(descriptor_xml))

    tree = ET.parse(descriptor_xml)
    root = tree.getroot()

    # UIs of the nodes that were already added
    check_qualifier = set()
    check_concepts = set()
    check_terms = set()

    # Iterate the element directly: Element.getchildren() was removed in
    # Python 3.9, iteration yields the same direct children.
    for descriptor_record in root:
        descriptor_ui = descriptor_record.find('DescriptorUI').text

        # <DescriptorName>
        #   <String>Calcimycin</String>
        # </DescriptorName>
        descriptor_name = descriptor_record.find('.DescriptorName/String').text

        self.descriptor.add_node({'sid': descriptor_ui, 'name': descriptor_name})

        # <AllowableQualifiersList>
        #   <AllowableQualifier>
        #     <QualifierReferredTo>
        #       <QualifierUI>Q000302</QualifierUI>
        #       <QualifierName>
        #         <String>isolation & purification</String>
        #       </QualifierName>
        #     </QualifierReferredTo>
        #     <Abbreviation>IP</Abbreviation>
        #   </AllowableQualifier>
        # </AllowableQualifiersList>
        allowed_qualifiers = descriptor_record.findall(
            '.AllowableQualifiersList/AllowableQualifier/QualifierReferredTo')

        for qualifier in allowed_qualifiers:
            qualifier_ui = qualifier.find('.QualifierUI').text

            # add qualifier node if it does not exist yet
            if qualifier_ui not in check_qualifier:
                qualifier_name = qualifier.find('.QualifierName/String').text
                self.qualifier.add_node({'sid': qualifier_ui, 'name': qualifier_name})
                check_qualifier.add(qualifier_ui)

            # add descriptor -> qualifier relationship
            self.descriptor_allowed_qualifier.add_relationship(
                {'sid': descriptor_ui}, {'sid': qualifier_ui}, {'source': 'mesh'}
            )

        # <ConceptList>
        #   <Concept PreferredConceptYN="Y">
        #     <ConceptUI>M0000001</ConceptUI>
        #     <ConceptName>
        #       <String>Calcimycin</String>
        #     </ConceptName>
        #     <ScopeNote>An ionophorous, polyether antibiotic ...</ScopeNote>
        #     <ConceptRelationList>
        #       <ConceptRelation RelationName="NRW">
        #         <Concept1UI>M0000001</Concept1UI>
        #         <Concept2UI>M0353609</Concept2UI>
        #       </ConceptRelation>
        #     </ConceptRelationList>
        #     <TermList>
        #       <Term ConceptPreferredTermYN="Y" IsPermutedTermYN="N"
        #             LexicalTag="NON" RecordPreferredTermYN="Y">
        #         <TermUI>T000002</TermUI>
        #         <String>Calcimycin</String>
        #       </Term>
        #     </TermList>
        #   </Concept>
        # </ConceptList>
        concepts = descriptor_record.findall('.ConceptList/Concept')

        for concept in concepts:
            preferred_concept = concept.attrib['PreferredConceptYN']
            concept_ui = concept.find('.ConceptUI').text

            # concept node if not exists
            if concept_ui not in check_concepts:
                concept_properties = {
                    'sid': concept_ui,
                    'name': concept.find('.ConceptName/String').text,
                }

                # the scope note is optional, only store it when the
                # element is present
                scope_note = concept.find('.ScopeNote')
                if scope_note is not None:
                    concept_properties['scope_note'] = scope_note.text

                self.concept.add_node(concept_properties)
                check_concepts.add(concept_ui)

            # (Descriptor)--(Concept) relation
            self.descriptor_has_concept.add_relationship(
                {'sid': descriptor_ui}, {'sid': concept_ui},
                {'preferred': preferred_concept}
            )

            # concept relations
            for concept_relation in concept.findall('.ConceptRelationList/ConceptRelation'):
                left = concept_relation.find('.Concept1UI').text
                right = concept_relation.find('.Concept2UI').text
                name = concept_relation.attrib['RelationName']

                self.concept_related_concept.add_relationship(
                    {'sid': left}, {'sid': right}, {'name': name}
                )

            # iterate Terms for concept
            for term in concept.findall('.TermList/Term'):
                term_ui = term.find('TermUI').text
                concept_preferred_term = term.attrib['ConceptPreferredTermYN']

                # Term node if not exists
                if term_ui not in check_terms:
                    term_name = term.find('.String').text
                    self.term.add_node({'sid': term_ui, 'name': term_name})
                    check_terms.add(term_ui)

                # (Concept)--(Term)
                self.concept_has_term.add_relationship(
                    {'sid': concept_ui}, {'sid': term_ui},
                    {'preferred': concept_preferred_term}
                )