def scrub(self):
    """
    Comment out the malformed row of the raw variant_citations file.

    The var_citations file has a bad row in it with > 6 cols.  That row is
    recognised by its leading value ``15091`` and is disabled by rewriting
    that leading value to ``#15091``.

    :return: None
    """
    # Alternative noted by the original author (drops over-wide rows
    # with awk instead of commenting them out):
    # awk -F"\t" '{if (NF <= 6) print $1, $2, $3, $4, $5, $6 ; OFS = "\t"}' variant_citations.txt
    f = '/'.join((self.rawdir, self.files['variant_citations']['file']))
    logger.info('removing the line that has too many cols (^15091)')
    # The trailing \b restricts the match to the exact leading token 15091.
    # A bare "^15091" prefix would also hit rows that start with 150910,
    # 150911, ... and silently comment out good data.
    # NOTE(review): assumes pysed.replace applies the pattern as a regex
    # per line (the original already relied on "^") -- confirm.
    pysed.replace(r"^15091\b", '#15091', f)
def scrub(self):
    """
    Disable the over-wide row in the raw variant_citations file.

    The var_citations file has a bad row in it with > 6 cols.  The row is
    located by its leading value ``15091``; that value is rewritten to
    ``#15091`` so the row reads as a comment.

    :return: None
    """
    # Alternative noted by the original author (keep only 6-column rows):
    # awk -F"\t" 'BEGIN{OFS = "\t"}NF==6{print}' variant_citations.txt
    f = '/'.join((self.rawdir, self.files['variant_citations']['file']))
    logger.info('removing the line that has too many cols (^15091)')
    # Anchor on a word boundary so only rows whose first token is exactly
    # 15091 are touched; the unbounded prefix "^15091" would also match
    # rows beginning 150910, 150911, ... and comment out valid records.
    # NOTE(review): assumes pysed.replace treats the pattern as a regex
    # applied line by line (the original "^" anchor implies this) -- confirm.
    pysed.replace(r"^15091\b", '#15091', f)
def scrub(self):
    """
    Clean up identifier prefixes in the raw annotation file before parsing.

    The file is ``self.files['annot']['file']`` under ``self.rawdir``.
    Every rewrite in the table below is logged and then handed to
    ``pysed.replace`` in the order listed; the order is kept as-is because
    a later pattern can act on the output of an earlier one (for example
    ``PubMed12345`` first becomes ``PMID12345`` and then ``PMID:12345``).

    :return: None
    """
    annot_path = '/'.join((self.rawdir, self.files['annot']['file']))

    # (log message, pattern, replacement) -- lots of publication rewriting
    # NOTE(review): if pysed.replace uses unanchored regex search, the
    # MIM<digits> pattern would also match inside "OMIM<digits>" and
    # yield "OOMIM:<digits>" -- verify against the data.
    rewrites = (
        ('scrubbing PubMed:12345 --> PMID:12345',
         "PubMed", 'PMID'),
        ('scrubbing pmid:12345 --> PMID:12345',
         "pmid", 'PMID'),
        ('scrubbing PMID12345 --> PMID:12345',
         "PMID([0-9][0-9]*)", 'PMID:\\1'),
        ('scrubbing MIM12345 --> OMIM:12345',
         'MIM([0-9][0-9]*)', 'OMIM:\\1'),
        ('scrubbing MIM:12345 --> OMIM:12345',
         ";MIM", ";OMIM"),
        ('scrubbing ORPHANET --> Orphanet',
         "ORPHANET", "Orphanet"),
    )

    for message, pattern, replacement in rewrites:
        logger.info(message)
        pysed.replace(pattern, replacement, annot_path)
def scrub(self):
    """
    Clean up identifier prefixes in the raw annotation file before parsing.

    The file is ``self.files['annot']['file']`` under ``self.rawdir``.
    Each entry of the rewrite table is logged and then passed to
    ``pysed.replace``, strictly in the order listed, since a later pattern
    can act on text produced by an earlier one (for example ``PMID: 123``
    is collapsed to ``PMID:123`` only after ``PubMed:`` / ``pmid:`` have
    been mapped to ``PMID:``).

    Covered rewrites: PubMed / pmid / PMID spelling and spacing variants,
    MIM --> OMIM prefixes, and ORPHANET / ORPHA --> Orphanet.

    :return: None
    """
    annot_path = '/'.join((self.rawdir, self.files['annot']['file']))

    # (log message, pattern, replacement) -- lots of publication rewriting
    # NOTE(review): if pysed.replace uses unanchored regex search, the
    # MIM<digits> pattern would also match inside "OMIM<digits>" and
    # yield "OOMIM:<digits>" -- verify against the data.
    rewrites = (
        ('scrubbing PubMed:12345 --> PMID:12345',
         r'PubMed:', 'PMID:'),
        ('scrubbing pmid:12345 --> PMID:12345',
         r'pmid:', 'PMID:'),
        ('scrubbing PMID: 12345 --> PMID:12345',
         r'PMID: *', 'PMID:'),
        ('scrubbing PMID12345 --> PMID:12345',
         r'PMID([0-9][0-9]*)', r'PMID:\1'),
        ('scrubbing MIM12345 --> OMIM:12345',
         r'MIM([0-9][0-9]*)', r'OMIM:\1'),
        ('scrubbing MIM:12345 --> OMIM:12345',
         r";MIM", ";OMIM"),
        ('scrubbing ORPHANET --> Orphanet',
         "ORPHANET", "Orphanet"),
        # Runs after the ORPHANET rewrite; "Orphanet" no longer contains
        # the upper-case "ORPHA", so it is not expanded a second time.
        ('scrubbing ORPHA --> Orphanet',
         "ORPHA", "Orphanet"),
    )

    for message, pattern, replacement in rewrites:
        logger.info(message)
        pysed.replace(pattern, replacement, annot_path)