Example #1
0
    def scrub(self):
        """
        Comment out the one known-bad row (it has more than 6 columns) in
        the var_citations file so downstream parsing does not choke on it.

        :return:
        """
        # shell equivalent:
        # awk -F"\t" '{if (NF <= 6) print $1, $2, $3, $4, $5, $6; OFS="\t"}' variant_citations.txt
        raw_path = '/'.join((self.rawdir, self.files['variant_citations']['file']))
        logger.info('removing the line that has too many cols (^15091)')
        # prefix the offending line with '#' in place rather than deleting it
        pysed.replace("^15091", '#15091', raw_path)

        return
Example #2
0
    def scrub(self):
        """
        Neutralize the malformed row (> 6 columns) in the var_citations
        file by commenting it out in place.

        :return:

        """
        # shell equivalent: awk -F"\t" 'BEGIN{OFS = "\t"}NF==6{print}' variant_citations.txt
        file_name = self.files['variant_citations']['file']
        target = '/'.join((self.rawdir, file_name))
        logger.info('removing the line that has too many cols (^15091)')
        # the bad line starts with 15091; turning it into a comment keeps
        # the raw file intact for inspection
        pysed.replace("^15091", '#15091', target)

        return
Example #3
0
    def scrub(self):
        """
        Perform various data-scrubbing on the raw data files prior to parsing.
        For this resource, this currently includes:
        * revise errors in identifiers for some OMIM and PMIDs
        :return: None
        """
        # scrub file of the oddities...lots of publication rewriting
        f = '/'.join((self.rawdir, self.files['annot']['file']))

        logger.info('scrubbing PubMed:12345 --> PMID:12345')
        # anchor on the trailing colon so text that merely contains
        # "PubMed" (without a following id) is not rewritten
        pysed.replace(r'PubMed:', 'PMID:', f)

        logger.info('scrubbing pmid:12345 --> PMID:12345')
        pysed.replace(r'pmid:', 'PMID:', f)

        logger.info('scrubbing PMID12345 --> PMID:12345')
        # insert the missing colon between the prefix and the number
        pysed.replace(r'PMID([0-9][0-9]*)', r'PMID:\1', f)

        logger.info('scrubbing MIM12345 --> OMIM:12345')
        pysed.replace(r'MIM([0-9][0-9]*)', r'OMIM:\1', f)

        logger.info('scrubbing MIM:12345 --> OMIM:12345')
        # only the ';MIM' form is rewritten here; bare 'MIM<number>' was
        # already handled by the previous substitution
        pysed.replace(r';MIM', ';OMIM', f)

        logger.info('scrubbing ORPHANET --> Orphanet')
        pysed.replace('ORPHANET', 'Orphanet', f)
        return
Example #4
0
    def scrub(self):
        """
        Perform various data-scrubbing on the raw data files prior to parsing.
        For this resource, this currently includes:
        * revise errors in identifiers for some OMIM and PMIDs

        :return: None

        """

        # scrub file of the oddities...lots of publication rewriting
        f = '/'.join((self.rawdir, self.files['annot']['file']))

        # (log message, pattern, replacement) applied in order; ordering
        # matters — e.g. ORPHANET must be rewritten before the shorter ORPHA
        substitutions = (
            ('scrubbing PubMed:12345 --> PMID:12345', r'PubMed:', 'PMID:'),
            ('scrubbing pmid:12345 --> PMID:12345', r'pmid:', 'PMID:'),
            ('scrubbing PMID:    12345 --> PMID:12345', r'PMID:  *', 'PMID:'),
            ('scrubbing PMID12345 --> PMID:12345', r'PMID([0-9][0-9]*)', r'PMID:\1'),
            ('scrubbing MIM12345 --> OMIM:12345', r'MIM([0-9][0-9]*)', r'OMIM:\1'),
            ('scrubbing MIM:12345 --> OMIM:12345', r";MIM", ";OMIM"),
            ('scrubbing ORPHANET --> Orphanet', "ORPHANET", "Orphanet"),
            ('scrubbing ORPHA --> Orphanet', "ORPHA", "Orphanet"),
        )

        for message, pattern, replacement in substitutions:
            logger.info(message)
            pysed.replace(pattern, replacement, f)
        return