Ejemplo n.º 1
0
    def all_remote_versions(self):
        """
        Collect every release found in the catalogue archive plus the latest one.

        Release numbers are parsed out of the catalogue file names listed in
        the ``archive`` directory; the result of ``latest_remote_version()``
        is appended as the final element.

        :return: list of DataSourceVersion objects
        """
        archive_url = posixpath.join(self.BASEURL, self.BASEPATH, 'archive')

        # unique names of the release catalogue files in the archive listing
        catalog_names = {
            entry.name
            for entry in downloader.list_ftp_dir(archive_url)
            if 'RefSeq-release' in entry.name
        }

        # extract the release number from each name,
        # e.g. RefSeq-release20.catalog.gz => 20
        release_numbers = []
        for catalog_name in catalog_names:
            release_token = catalog_name.split('-')[1].split('.')[0]
            release_numbers.append(int(release_token.replace('release', '')))

        # wrap the numbers and finish with the latest remote version
        remote_versions = list(map(DataSourceVersion, release_numbers))
        remote_versions.append(self.latest_remote_version())

        return remote_versions
Ejemplo n.º 2
0
    def latest_remote_version(self):
        """
        Read the latest release name from the server's 'RELEASE.metalink' file.

        The file is fetched from the current-release directory, its
        ``version`` element is extracted and passed through
        ``self._date_from_name`` (versions are named 'year_month',
        e.g. '2016_01').

        :return: Latest remote version.
        :rtype: DataSourceVersion
        """
        metalink_url = posixpath.join(
            self.UNIPROT_BASEURL,
            self.UNIPROT_CURRENT_BASEPATH,
            'RELEASE.metalink',
        )

        # download the XML document and turn the payload into text
        raw_payload = downloader.get_single_file_ftp(metalink_url).read()
        xml_text = raw_payload.decode()

        # rename the default-namespace attribute so that element lookups
        # below do not need namespace-qualified tags
        xml_text = xml_text.replace(' xmlns="', ' xmlnamespace="')

        root = ElementTree.fromstring(xml_text)
        version_name = root.find('version').text

        return DataSourceVersion(self._date_from_name(version_name))
Ejemplo n.º 3
0
 def latest_remote_version(self):
     """
     Fetch the number of the latest release from
     ftp://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER.

     :return: DataSourceVersion of latest remote version
     """
     release_number_url = 'ftp://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER'

     # the downloaded payload is read, decoded to text and stripped of
     # trailing whitespace before being parsed as an integer
     raw_payload = downloader.get_single_file_ftp(release_number_url).read()
     release_text = raw_payload.decode().rstrip()

     return DataSourceVersion(int(release_text))
Ejemplo n.º 4
0
    def all_remote_versions(self):
        """
        Build one DataSourceVersion for each entry of ``VERSION_2_URL``.

        The versions come from iterating the module-level ``VERSION_2_URL``
        (presumably a version -> URL mapping; verify against its definition);
        no server listing is performed here.

        :return: List of DataSourceVersions.
        :rtype: list(DataSourceVersion)
        """
        return list(map(DataSourceVersion, VERSION_2_URL))
Ejemplo n.º 5
0
    def all_remote_versions(self):
        """
        List the versions available on the miRBase FTP server.

        Every entry returned by the FTP directory listing of the miRBase
        base path is turned into a DataSourceVersion built from its name.

        :return: List of DataSourceVersions available on server.
        :rtype: list(DataSourceVersion)
        """
        remote_entries = downloader.list_ftp_dir(
            self.mirbase_baseurl, path=self.mirbase_basepath)

        found_versions = []
        for entry in remote_entries:
            found_versions.append(DataSourceVersion(entry.name))

        return found_versions
Ejemplo n.º 6
0
    def get_catalog_file_path(instance):
        """
        Return the path to the catalog file of the given instance.

        The unfiltered file is preferred when it exists on disk; otherwise
        the path of the filtered file is returned (without checking that it
        exists).

        :param instance: The DataSource instance
        :return: The catalog file path
        """
        release = DataSourceVersion.version_from_string(instance.version)

        unfiltered_name = 'RefSeq-release{0}.catalog.gz'.format(release)
        unfiltered_path = os.path.join(instance.instance_dir, unfiltered_name)
        if os.path.exists(unfiltered_path):
            return unfiltered_path

        # fall back to the filtered catalog
        filtered_name = 'RefSeq-release{0}.catalog.filtered.gz'.format(release)
        return os.path.join(instance.instance_dir, filtered_name)
Ejemplo n.º 7
0
    def get_accession2geneid_file_path(instance):
        """
        Return the path to the accession2geneid file of the given instance.

        The unfiltered file is preferred when it exists on disk; otherwise
        the path of the filtered file is returned (without checking that it
        exists).

        :param instance: The DataSource instance
        :return: The accession2geneid file path
        """
        release = DataSourceVersion.version_from_string(instance.version)

        unfiltered_name = 'release{0}.accession2geneid.gz'.format(release)
        unfiltered_path = os.path.join(instance.instance_dir, unfiltered_name)
        if os.path.exists(unfiltered_path):
            return unfiltered_path

        # fall back to the filtered file
        filtered_name = 'release{0}.accession2geneid.filtered.gz'.format(release)
        return os.path.join(instance.instance_dir, filtered_name)
Ejemplo n.º 8
0
 def latest_remote_version(self):
     """
     Return the fixed version '03-2018'.

     Only the latest version is accessible, so the value is hard-coded
     rather than looked up.
     """
     pinned_version = '03-2018'
     return DataSourceVersion(pinned_version)
Ejemplo n.º 9
0
    def parse_xml(self):
        """
        Parse the MeSH descriptor XML file into node and relationship sets.

        The file ``desc<version>.xml`` is obtained from the 'Mesh' instance.
        For every child record of the XML root this method adds:

        - a descriptor node (``self.descriptor``)
        - qualifier nodes (``self.qualifier``) and descriptor -> qualifier
          relationships (``self.descriptor_allowed_qualifier``)
        - concept nodes (``self.concept``), descriptor -> concept
          relationships (``self.descriptor_has_concept``) and
          concept -> concept relationships (``self.concept_related_concept``)
        - term nodes (``self.term``) and concept -> term relationships
          (``self.concept_has_term``)

        Qualifier, concept and term nodes are added only once per UI, while
        relationships are added for every occurrence in the file.
        """
        mesh_instance = self.get_instance_by_name('Mesh')

        version = DataSourceVersion.version_from_string(
            mesh_instance.version
        )

        descriptor_xml = mesh_instance.get_file('desc{}.xml'.format(str(version)))
        log.debug("XML file {}".format(descriptor_xml))

        tree = ET.parse(descriptor_xml)
        root = tree.getroot()

        # UIs for which a node has already been added
        check_qualifier = set()
        check_concepts = set()
        check_terms = set()

        # Iterate the element directly: Element.getchildren() was removed
        # from xml.etree.ElementTree in Python 3.9.
        for descriptor_record in root:
            descriptor_ui = descriptor_record.find('DescriptorUI').text

            # <DescriptorName>
            #  <String>Calcimycin</String>
            # </DescriptorName>
            descriptor_name = descriptor_record.find('.DescriptorName/String').text

            self.descriptor.add_node({'sid': descriptor_ui, 'name': descriptor_name})

            #   <AllowableQualifiersList>
            #   <AllowableQualifier>
            #    <QualifierReferredTo>
            #     <QualifierUI>Q000302</QualifierUI>
            #      <QualifierName>
            #      <String>isolation &amp; purification</String>
            #      </QualifierName>
            #    </QualifierReferredTo>
            #    <Abbreviation>IP</Abbreviation>
            #   </AllowableQualifier>
            #   </AllowableQualifiersList>

            allowed_qualifiers = descriptor_record.findall(
                '.AllowableQualifiersList/AllowableQualifier/QualifierReferredTo')
            for qualifier in allowed_qualifiers:
                qualifier_ui = qualifier.find('.QualifierUI').text

                # add qualifier node if it does not exist yet
                if qualifier_ui not in check_qualifier:
                    qualifier_name = qualifier.find('.QualifierName/String').text
                    self.qualifier.add_node({'sid': qualifier_ui, 'name': qualifier_name})
                    check_qualifier.add(qualifier_ui)

                # add descriptor -> qualifier relationship
                self.descriptor_allowed_qualifier.add_relationship(
                    {'sid': descriptor_ui}, {'sid': qualifier_ui}, {'source': 'mesh'}
                )

            #  <ConceptList>
            #    <Concept PreferredConceptYN="Y">
            #     <ConceptUI>M0000001</ConceptUI>
            #     <ConceptName>
            #      <String>Calcimycin</String>
            #     </ConceptName>
            #     <CASN1Name>...</CASN1Name>
            #     <RegistryNumber>37H9VM9WZL</RegistryNumber>
            #     <ScopeNote>An ionophorous, polyether antibiotic from Streptomyces chartreusensis. ...
            #     </ScopeNote>
            #     <RelatedRegistryNumberList>
            #      <RelatedRegistryNumber>52665-69-7 (Calcimycin)</RelatedRegistryNumber>
            #     </RelatedRegistryNumberList>
            #     <ConceptRelationList>
            #      <ConceptRelation RelationName="NRW">
            #      <Concept1UI>M0000001</Concept1UI>
            #      <Concept2UI>M0353609</Concept2UI>
            #      </ConceptRelation>
            #     </ConceptRelationList>
            #     <TermList>
            #      <Term  ConceptPreferredTermYN="Y"  IsPermutedTermYN="N"  LexicalTag="NON"  RecordPreferredTermYN="Y">
            #       <TermUI>T000002</TermUI>
            #       <String>Calcimycin</String>
            #       <DateCreated>
            #        <Year>1999</Year>
            #        <Month>01</Month>
            #        <Day>01</Day>
            #       </DateCreated>
            #       <ThesaurusIDlist>
            #        <ThesaurusID>FDA SRS (2014)</ThesaurusID>
            #        <ThesaurusID>NLM (1975)</ThesaurusID>
            #       </ThesaurusIDlist>
            #      </Term>
            #     </TermList>
            #    </Concept>

            concepts = descriptor_record.findall('.ConceptList/Concept')

            for concept in concepts:
                preferred_concept = concept.attrib['PreferredConceptYN']

                concept_ui = concept.find('.ConceptUI').text

                # add concept node if it does not exist yet
                if concept_ui not in check_concepts:
                    concept_properties = {
                        'sid': concept_ui,
                        'name': concept.find('.ConceptName/String').text,
                    }

                    # ScopeNote is optional: only set the property when the
                    # element is present in the record
                    scope_note = concept.find('.ScopeNote')
                    if scope_note is not None:
                        concept_properties['scope_note'] = scope_note.text

                    self.concept.add_node(concept_properties)

                    check_concepts.add(concept_ui)

                # (Descriptor)--(Concept) relation
                self.descriptor_has_concept.add_relationship({'sid': descriptor_ui}, {'sid': concept_ui},
                                                             {'preferred': preferred_concept})

                # concept relations
                for concept_relation in concept.findall('.ConceptRelationList/ConceptRelation'):
                    left = concept_relation.find('.Concept1UI').text
                    right = concept_relation.find('.Concept2UI').text
                    name = concept_relation.attrib['RelationName']

                    self.concept_related_concept.add_relationship({'sid': left}, {'sid': right}, {'name': name})

                # iterate Terms for concept
                for term in concept.findall('.TermList/Term'):
                    term_ui = term.find('TermUI').text
                    concept_preferred_term = term.attrib['ConceptPreferredTermYN']

                    # add term node if it does not exist yet
                    if term_ui not in check_terms:
                        term_name = term.find('.String').text
                        self.term.add_node({'sid': term_ui, 'name': term_name})

                        check_terms.add(term_ui)

                    # (Concept)--(Term)
                    self.concept_has_term.add_relationship({'sid': concept_ui}, {'sid': term_ui},
                                                           {'preferred': concept_preferred_term})