def resource_bowtie2_log(uri, **kwargs):
    """Read an alignment log (presumably bowtie2 output) into a count table.

    Every line that does not start with ``Warn`` has its leading integer
    extracted; the last retained line is discarded and the remaining values
    are converted to ``int``.

    Args:
        uri: Path of the log file to read.
        **kwargs: Accepted but not used by this function.

    Returns:
        A DataFrame with one ``counts`` column, indexed by five fixed
        statistic labels; the index is named ``statistic``.

    Raises:
        ValueError: If a retained value is not an integer, or if the number
            of values does not match the five labels.
    """
    with open(uri) as handle:
        text = handle.read()

    labels = [
        'Number of Reads',
        'Number Unpaired',
        'Number Unaligned',
        'Number Uniquely Aligned',
        'Number Ambiguously Aligned',
    ]

    # Collapse each retained line to the number it starts with.
    extracted = [
        re.sub(r' *(\d+)[ %]+.+', r'\1', line)
        for line in text.strip().split('\n')
        if not line.startswith('Warn')
    ]

    # The final retained line is dropped before conversion
    # (presumably the overall alignment-rate summary -- TODO confirm).
    counts = [int(item) for item in extracted[:-1]]

    table = DataFrame(counts, index=labels)
    table.index.name = 'statistic'
    table.columns = ['counts']
    return table
def resource_cutadapt_metrics(uri, **kwargs):
    """Read the second ``=== ... ===`` delimited section of a report file.

    The file is split on lines of the form ``=== title ===``; only the chunk
    following the first such delimiter is parsed. Each non-empty line of that
    chunk is passed through ``_split_x`` (defined elsewhere in this module;
    presumably yields a (statistic, value) pair -- TODO confirm).

    Args:
        uri: Path of the report file to read.
        **kwargs: Accepted but not used by this function.

    Returns:
        A DataFrame built from the parsed records with columns
        ``statistic`` and ``value``, where ``value`` has been converted
        with ``pd.to_numeric``.

    Raises:
        IndexError: If the file contains no ``=== ... ===`` delimiter.
    """
    with open(uri) as handle:
        text = handle.read()

    chunks = re.split("\n===.*===\n", text)
    summary_lines = chunks[1].split("\n")
    records = [_split_x(line) for line in summary_lines if line]

    table = DataFrame.from_records(
        records,
        index=["statistic"],
        columns=["statistic", "value"],
    )
    table["value"] = pd.to_numeric(table["value"])
    return table
def _reader(uri): with open(uri) as fh: data = [x.strip("\n").split("\t") for x in fh if not x.strip() == ""] indices = list((i for i, val in enumerate(data) if val[0].startswith("## METRICS CLASS"))) metrics = DataFrame.from_records(data[(indices[0] + 2):], columns=data[(indices[0] + 1)], index="CATEGORY") return (metrics, None)
def _hist_reader(uri): with open(uri) as fh: data = [x.strip("\n").split("\t") for x in fh if not x.strip() == ""] indices = list((i for i, val in enumerate(data) if val[0].startswith("## METRICS CLASS") or val[0].startswith("## HISTOGRAM"))) if len(indices) == 1: indices.append(len(data)) metrics = DataFrame.from_records(data[(indices[0] + 2):(indices[1])], columns=data[(indices[0] + 1)]) # We could be missing the histogram try: hist = DataFrame.from_records(data[(indices[1] + 2):], columns=data[(indices[1] + 1)]) except: logger.warn("No histogram data for {}".format(uri)) hist = None return (metrics, hist)
def resource_genome_results(uri, key="Globals", **kwargs):
    """Parse a sectioned results file and return one section as a table.

    Sections are introduced by lines of the form ``>>> Section name``. The
    text before the first such line is labelled ``Header``. Section names
    have spaces replaced by underscores. ``Header`` and ``Coverage`` are
    skipped; ``Coverage_per_contig`` is parsed as whitespace-separated
    columns; every other section is parsed line by line with ``_split_x``
    (defined elsewhere in this module; presumably yields a
    (statistic, value) pair -- TODO confirm).

    Args:
        uri: Path of the results file to read.
        key: Name of the parsed section to return (underscored form).
        **kwargs: Accepted but not used by this function.

    Returns:
        The DataFrame for section ``key``. ``Coverage_per_contig`` is
        indexed by ``chr``; other sections are indexed by ``statistic``.
        All sections except ``Input`` are converted with ``pd.to_numeric``.

    Raises:
        KeyError: If ``key`` is not a parsed section (this includes the
            skipped ``Header`` and ``Coverage`` sections).
    """
    with open(uri) as fh:
        data = fh.read()

    # Raw strings: "\s" in a normal string literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    sections = re.split(r">+\s+[a-zA-Z ]+", data)
    section_names = ["Header"] + [
        name.replace(" ", "_")
        for name in re.findall(r">+\s+([a-zA-Z ]+)", data)
    ]

    d = {}
    for h, sec in zip(section_names, sections):
        if h in ("Coverage", "Header"):
            # These sections are not parsed.
            continue
        lines = [x for x in sec.split("\n") if x]
        if h == "Coverage_per_contig":
            d[h] = DataFrame.from_records(
                [re.split(r"\s+", x.strip()) for x in lines],
                columns=COVERAGE_PER_CONTIG_COLUMNS,
                index="chr",
            )
            d[h] = d[h].apply(pd.to_numeric)
        else:
            d[h] = DataFrame.from_records(
                [_split_x(x) for x in lines],
                columns=["statistic", "value"],
                index="statistic",
            )
            # The "Input" section is left as parsed, without numeric
            # conversion.
            if h not in ["Input"]:
                d[h] = d[h].apply(pd.to_numeric)
    return d[key]
# In[2]: client = MongoClient() db = client.nytimes3 # In[3]: total = db.articles.count() percent = math.ceil(total / 100) count = 0 print('Total documents:', total) print() total_rows_df = DataFrame() try: for doc in db.articles.find(): rows_df = DataFrame() try: # Texts common_texts = [] if doc.get('abstract'): common_texts.append(doc['abstract']) if doc.get('headline') and isinstance(doc['headline'], dict) and doc['headline'].get('main'): common_texts.append(doc['headline']['main']) if doc.get('lead_paragraph'): common_texts.append(doc['lead_paragraph']) # add snippet as variable field