def conflict(soup):
    """Collect "fn" nodes whose fn-type is "conflict" or "COI-statement".

    Both lookups go through extract_nodes; the "COI-statement" matches are
    appended after the "conflict" matches and the combined result returned.
    """
    fn_tags = extract_nodes(soup, "fn", attr="fn-type", value="conflict")
    coi_tags = extract_nodes(soup, "fn", attr="fn-type", value="COI-statement")
    fn_tags += coi_tags
    return fn_tags
def abstract(soup, abstract_type=None):
    """Return "abstract" nodes found by extract_nodes.

    When abstract_type is truthy the lookup is narrowed with
    attr="abstract-type" and value=abstract_type; otherwise every
    "abstract" node is requested.
    """
    if not abstract_type:
        return extract_nodes(soup, "abstract")
    return extract_nodes(
        soup, "abstract", attr="abstract-type", value=abstract_type)
def custom_meta(soup, meta_name=None):
    """Return "custom-meta" nodes, optionally restricted by meta-name.

    With meta_name left as None the extract_nodes result is returned
    untouched. Otherwise the nodes are passed through filter(), keeping
    those whose first "meta-name" child, rendered by node_contents_str,
    equals meta_name.
    """
    def has_requested_name(tag):
        # Look only at the first "meta-name" node beneath this tag.
        name_tag = first(extract_nodes(tag, "meta-name"))
        return node_contents_str(name_tag) == meta_name

    all_tags = extract_nodes(soup, "custom-meta")
    if meta_name is None:
        return all_tags
    return filter(has_requested_name, all_tags)
def authors(soup, contrib_type="author"):
    """Return "contrib" nodes found by extract_nodes.

    By default only nodes with contrib-type="author" are requested. Pass a
    falsy contrib_type (e.g. None) to request every "contrib" node.
    """
    if not contrib_type:
        return extract_nodes(soup, "contrib")
    return extract_nodes(
        soup, "contrib", attr="contrib-type", value=contrib_type)
def fn_group(soup, content_type=None):
    """Return "fn-group" nodes found by extract_nodes.

    A truthy content_type narrows the lookup with attr="content-type" and
    value=content_type; otherwise every "fn-group" node is requested.
    """
    if not content_type:
        return extract_nodes(soup, "fn-group")
    return extract_nodes(
        soup, "fn-group", attr="content-type", value=content_type)
def pub_id(soup, pub_id_type=None):
    """Return "pub-id" nodes found by extract_nodes.

    A truthy pub_id_type narrows the lookup with attr="pub-id-type" and
    value=pub_id_type; otherwise every "pub-id" node is requested.
    """
    if not pub_id_type:
        return extract_nodes(soup, "pub-id")
    return extract_nodes(
        soup, "pub-id", attr="pub-id-type", value=pub_id_type)
def ext_link(soup, ext_link_type=None):
    """Return "ext-link" nodes found by extract_nodes.

    A truthy ext_link_type narrows the lookup with attr="ext-link-type" and
    value=ext_link_type; otherwise every "ext-link" node is requested.
    """
    if not ext_link_type:
        return extract_nodes(soup, "ext-link")
    return extract_nodes(
        soup, "ext-link", attr="ext-link-type", value=ext_link_type)
def journal_id(soup):
    """Return firstnn() of the "journal-id" nodes typed "publisher-id".

    The nodes are requested from extract_nodes with
    attr="journal-id-type" and value="publisher-id".
    """
    publisher_id_tags = extract_nodes(
        soup, "journal-id", attr="journal-id-type", value="publisher-id")
    # the first non-nil tag
    return firstnn(publisher_id_tags)
def research_organism_keywords(soup):
    """Return the "kwd" children of the research-organism keyword group.

    The group is first() of the "kwd-group" nodes that extract_nodes finds
    with kwd-group-type="research-organism".

    Returns:
        A list of the group's direct children whose name is "kwd", or None
        when there is no such group or it has no "kwd" children.
    """
    kwd_group = first(extract_nodes(
        soup, "kwd-group", attr="kwd-group-type", value="research-organism"))
    if not kwd_group:
        return None
    # Build a real list rather than calling filter(): on Python 3 a filter
    # object is always truthy, so "filter(...) or None" could never fall
    # back to None. On Python 2 this yields the same list as before.
    return [tag for tag in kwd_group if tag.name == "kwd"] or None
def author_keywords(soup):
    """Gather "kwd" children from author keyword groups into one list.

    A few articles have kwd-group with no kwd-group-type, so groups whose
    kwd-group-type is missing (None) are accepted alongside those typed
    "author-keywords".
    """
    collected = []
    for group in extract_nodes(soup, "kwd-group"):
        group_type = group.get("kwd-group-type")
        if group_type == "author-keywords" or group_type is None:
            # Iterating the group walks its direct children.
            for child in group:
                if child.name == "kwd":
                    collected.append(child)
    return collected
def article_contributors(soup):
    """Return contributor nodes that sit directly inside a "contrib-group".

    Only nodes beneath the tag returned by article_meta(soup) are
    considered; when that tag is falsy the function returns None.
    "contrib" and "on-behalf-of" nodes are requested together and then
    passed through filter() on their parent's name.
    """
    meta_tag = article_meta(soup)
    if not meta_tag:
        return None

    def in_contrib_group(tag):
        return tag.parent.name == "contrib-group"

    candidates = extract_nodes(meta_tag, ["contrib", "on-behalf-of"])
    return filter(in_contrib_group, candidates)
def full_subject_area(soup, subject_group_type=None):
    """Return "subj-group" nodes that belong to the article's own metadata.

    A node is kept only when its parent is "article-categories" and its
    grandparent is "article-meta". When subject_group_type is truthy the
    result is narrowed further to nodes whose "subj-group-type" attribute
    equals it.

    Returns:
        The result of filter() over the extract_nodes matches.
    """
    subject_group_tags = extract_nodes(soup, "subj-group")
    subject_group_tags = filter(
        lambda tag: (tag.parent.name == "article-categories"
                     and tag.parent.parent.name == "article-meta"),
        subject_group_tags)
    if subject_group_type:
        # Fixed: the comparison used to sit inside tag.get(...), so the
        # lookup key was a boolean, and filter() was called without an
        # iterable, which raises TypeError.
        subject_group_tags = filter(
            lambda tag: tag.get("subj-group-type") == subject_group_type,
            subject_group_tags)
    return subject_group_tags
def subject_area(soup, subject_group_type=None):
    """Return "subject" nodes from the article's own subject groups.

    Supports all subject areas or just particular ones filtered by
    subject_group_type. A node is kept only when its parent is
    "subj-group", its grandparent is "article-categories" and its
    great-grandparent is "article-meta". When subject_group_type is truthy
    the kept nodes are narrowed to those whose parent's "subj-group-type"
    attribute equals it.

    Returns:
        The result of filter() over the extract_nodes matches.
    """
    tags = extract_nodes(soup, "subject")
    subject_area_tags = filter(
        lambda tag: (tag.parent.name == "subj-group"
                     and tag.parent.parent.name == "article-categories"
                     and tag.parent.parent.parent.name == "article-meta"),
        tags)
    if subject_group_type:
        # Fixed: narrow the already-filtered nodes. Filtering the raw
        # "subject" list here threw away the article-meta ancestry check
        # whenever a group type was requested.
        subject_area_tags = filter(
            lambda tag: tag.parent.get("subj-group-type") == subject_group_type,
            subject_area_tags)
    return subject_area_tags
def full_subject_area(soup, subject_group_type=None):
    """Return "subj-group" nodes that belong to the article's own metadata.

    A node is kept only when its parent is "article-categories" and its
    grandparent is "article-meta". When subject_group_type is truthy the
    result is narrowed further to nodes whose "subj-group-type" attribute
    equals it.

    Returns:
        The result of filter() over the extract_nodes matches.
    """
    subject_group_tags = extract_nodes(soup, "subj-group")
    subject_group_tags = filter(
        lambda tag: (tag.parent.name == "article-categories"
                     and tag.parent.parent.name == "article-meta"),
        subject_group_tags)
    if subject_group_type:
        # Fixed: the comparison used to sit inside tag.get(...), so the
        # lookup key was a boolean, and filter() was called without an
        # iterable, which raises TypeError.
        subject_group_tags = filter(
            lambda tag: tag.get("subj-group-type") == subject_group_type,
            subject_group_tags)
    return subject_group_tags
def subject_area(soup, subject_group_type=None):
    """Return "subject" nodes from the article's own subject groups.

    Supports all subject areas or just particular ones filtered by
    subject_group_type. A node is kept only when its parent is
    "subj-group", its grandparent is "article-categories" and its
    great-grandparent is "article-meta". When subject_group_type is truthy
    the kept nodes are narrowed to those whose parent's "subj-group-type"
    attribute equals it.

    Returns:
        The result of filter() over the extract_nodes matches.
    """
    tags = extract_nodes(soup, "subject")
    subject_area_tags = filter(
        lambda tag: (tag.parent.name == "subj-group"
                     and tag.parent.parent.name == "article-categories"
                     and tag.parent.parent.parent.name == "article-meta"),
        tags)
    if subject_group_type:
        # Fixed: narrow the already-filtered nodes. Filtering the raw
        # "subject" list here threw away the article-meta ancestry check
        # whenever a group type was requested.
        subject_area_tags = filter(
            lambda tag: tag.parent.get("subj-group-type") == subject_group_type,
            subject_area_tags)
    return subject_area_tags
def simulate(vbmap):
    """Draw one histogram of master nodes per map from simulate_failovers.

    A new pylab figure is opened and divided into a square grid just large
    enough for every map returned by simulate_failovers(vbmap). Each
    subplot shows a histogram of that map's non-None masters, with one bin
    per node of the original vbmap.
    """
    pylab.figure()

    node_names = extract_nodes(vbmap['map'])
    node_total = len(node_names)

    failover_maps = simulate_failovers(vbmap)
    # Smallest square grid that holds all the charts.
    grid_side = int(math.ceil(math.sqrt(len(failover_maps))))

    # Subplot indices are 1-based.
    for position, failover_map in enumerate(failover_maps, 1):
        pylab.subplot(grid_side, grid_side, position)

        live_masters = [
            master for master in extract_masters(failover_map)
            if master is not None
        ]

        # Tick labels sit at the centre of each unit-wide bin.
        pylab.xticks([i + 0.5 for i in xrange(node_total)], node_names)
        pylab.hist(live_masters, bins=xrange(node_total + 1))

        pylab.xlabel("Nodes")
        pylab.ylabel("Number of vbuckets")
        pylab.legend()
def award_group(soup):
    """Return the "award-group" nodes that extract_nodes finds in soup."""
    award_group_tags = extract_nodes(soup, "award-group")
    return award_group_tags
def list_item(soup):
    """Return the "list-item" nodes that extract_nodes finds in soup."""
    list_item_tags = extract_nodes(soup, "list-item")
    return list_item_tags
def fig_group(soup):
    """Return the "fig-group" nodes that extract_nodes finds in soup."""
    fig_group_tags = extract_nodes(soup, "fig-group")
    return fig_group_tags
def year(soup):
    """Return first() of the "year" nodes that extract_nodes finds in soup."""
    year_tags = extract_nodes(soup, "year")
    return first(year_tags)
def publisher_id(soup):
    """Return the publisher-id "article-id" node owned by article-meta.

    "article-id" nodes with pub-id-type="publisher-id" are requested from
    extract_nodes, filtered to those whose parent is named "article-meta",
    and the filtered result is handed to first().
    """
    def under_article_meta(tag):
        return tag.parent.name == "article-meta"

    candidates = extract_nodes(
        soup, "article-id", attr="pub-id-type", value="publisher-id")
    # the first article-id tag whose parent is article-meta
    return first(filter(under_article_meta, candidates))
def month(soup):
    """Return first() of the "month" nodes that extract_nodes finds in soup."""
    month_tags = extract_nodes(soup, "month")
    return first(month_tags)
def history_date(soup, date_type):
    """Return the "date" node of the given date-type found under "history".

    "date" nodes with date-type=date_type are requested from extract_nodes,
    filtered to those whose parent is named "history", and the filtered
    result is handed to first().
    """
    def inside_history(tag):
        return tag.parent.name == "history"

    candidates = extract_nodes(soup, "date", attr="date-type", value=date_type)
    return first(filter(inside_history, candidates))
def copyright_year(soup):
    """Return first() of the "copyright-year" nodes under permissions(soup)."""
    permissions_tag = permissions(soup)
    year_tags = extract_nodes(permissions_tag, "copyright-year")
    return first(year_tags)
def copyright_statement(soup):
    """Return first() of the "copyright-statement" nodes under permissions(soup)."""
    permissions_tag = permissions(soup)
    statement_tags = extract_nodes(permissions_tag, "copyright-statement")
    return first(statement_tags)
def article_title(soup):
    """Return first() of the "article-title" nodes that extract_nodes finds."""
    title_tags = extract_nodes(soup, "article-title")
    return first(title_tags)
def licence_p(soup):
    """Return first() of the "license-p" nodes under licence(soup)."""
    licence_tag = licence(soup)
    paragraph_tags = extract_nodes(licence_tag, "license-p")
    return first(paragraph_tags)
def licence(soup):
    """Return first() of the "license" nodes under permissions(soup)."""
    permissions_tag = permissions(soup)
    license_tags = extract_nodes(permissions_tag, "license")
    return first(license_tags)
def permissions(soup):
    """Return first() of the "permissions" nodes that extract_nodes finds."""
    # a better selector might be "article-meta.permissions"
    permissions_tags = extract_nodes(soup, "permissions")
    return first(permissions_tags)
def conflict(soup):
    """Return the "fn" nodes with fn-type="conflict" found by extract_nodes."""
    conflict_tags = extract_nodes(soup, "fn", attr="fn-type", value="conflict")
    return conflict_tags
def string_name(soup):
    """Return the "string-name" nodes that extract_nodes finds in soup."""
    string_name_tags = extract_nodes(soup, "string-name")
    return string_name_tags
def acknowledgements(soup):
    """Return first() of the "ack" nodes that extract_nodes finds in soup."""
    ack_tags = extract_nodes(soup, "ack")
    return first(ack_tags)
def copyright_holder(soup):
    """Return first() of the "copyright-holder" nodes under permissions(soup)."""
    permissions_tag = permissions(soup)
    holder_tags = extract_nodes(permissions_tag, "copyright-holder")
    return first(holder_tags)
def funding_statement(soup):
    """Return first() of the "funding-statement" nodes that extract_nodes finds."""
    statement_tags = extract_nodes(soup, "funding-statement")
    return first(statement_tags)
def research_organism_keywords(soup):
    """Return the "kwd" children of the research-organism keyword group.

    The group is first() of the "kwd-group" nodes that extract_nodes finds
    with kwd-group-type="research-organism".

    Returns:
        A list of the group's direct children whose name is "kwd", or None
        when there is no such group or it has no "kwd" children.
    """
    kwd_group = first(extract_nodes(
        soup, "kwd-group", attr="kwd-group-type", value="research-organism"))
    if not kwd_group:
        return None
    # Build a real list rather than calling filter(): on Python 3 a filter
    # object is always truthy, so "filter(...) or None" could never fall
    # back to None. On Python 2 this yields the same list as before.
    return [tag for tag in kwd_group if tag.name == "kwd"] or None
def journal_id(soup):
    """Return firstnn() of the "journal-id" nodes typed "hwp".

    The nodes are requested from extract_nodes with
    attr="journal-id-type" and value="hwp".
    """
    hwp_id_tags = extract_nodes(
        soup, "journal-id", attr="journal-id-type", value="hwp")
    # the first non-nil tag
    return firstnn(hwp_id_tags)
def history_date(soup, date_type):
    """Return the "date" node of the given date-type found under "history".

    "date" nodes with date-type=date_type are requested from extract_nodes,
    filtered to those whose parent is named "history", and the filtered
    result is handed to first().
    """
    def inside_history(tag):
        return tag.parent.name == "history"

    candidates = extract_nodes(soup, "date", attr="date-type", value=date_type)
    return first(filter(inside_history, candidates))
def doi(soup):
    """Return the doi "article-id" node owned by article-meta.

    "article-id" nodes with pub-id-type="doi" are requested from
    extract_nodes, filtered to those whose parent is named "article-meta",
    and the filtered result is handed to first().
    """
    def under_article_meta(tag):
        return tag.parent.name == "article-meta"

    candidates = extract_nodes(
        soup, "article-id", attr="pub-id-type", value="doi")
    # the first article-id tag whose parent is article-meta
    return first(filter(under_article_meta, candidates))
def pub_date_collection(soup, pub_type):
    """Return first() of the "pub-date" nodes whose pub-type is pub_type."""
    matching_dates = extract_nodes(
        soup, "pub-date", attr="pub-type", value=pub_type)
    return first(matching_dates)
def fig(soup):
    """Return the "fig" nodes that extract_nodes finds in soup."""
    fig_tags = extract_nodes(soup, "fig")
    return fig_tags
def pub_date(soup, date_type):
    """Return first() of the "pub-date" nodes whose date-type is date_type."""
    matching_dates = extract_nodes(
        soup, "pub-date", attr="date-type", value=date_type)
    return first(matching_dates)
# NOTE(review): this public name shadows the builtin ``list`` for the rest of
# the module; it is kept as-is so existing callers keep working.
def list(soup):
    """Return the "list" nodes that extract_nodes finds in soup."""
    list_tags = extract_nodes(soup, "list")
    return list_tags
def journal_issn(soup, pub_format):
    """Return first() of the "issn" nodes whose publication-format is pub_format."""
    issn_tags = extract_nodes(
        soup, "issn", attr="publication-format", value=pub_format)
    return first(issn_tags)
def funding_group(soup):
    """Return the "funding-group" nodes that extract_nodes finds in soup."""
    funding_group_tags = extract_nodes(soup, "funding-group")
    return funding_group_tags
def article_type(soup):
    """Return the 'article-type' attribute of the first "article" node.

    Returns raw data, just that the data doesn't contain any BS nodes.
    """
    article_tag = first(extract_nodes(soup, "article"))
    # NOTE(review): if first() hands back None (presumably when there is no
    # "article" node -- confirm), the .get call raises AttributeError.
    return article_tag.get('article-type')
def principal_award_recipient(soup):
    """Return the "principal-award-recipient" nodes that extract_nodes finds."""
    recipient_tags = extract_nodes(soup, "principal-award-recipient")
    return recipient_tags
def pub_date(soup, date_type):
    """Return first() of the "pub-date" nodes whose date-type is date_type."""
    matching_dates = extract_nodes(
        soup, "pub-date", attr="date-type", value=date_type)
    return first(matching_dates)
def journal_title(soup):
    """Return first() of the "journal-title" nodes that extract_nodes finds."""
    title_tags = extract_nodes(soup, "journal-title")
    return first(title_tags)
def keyword_group(soup):
    """Return the "kwd-group" nodes that extract_nodes finds in soup."""
    kwd_group_tags = extract_nodes(soup, "kwd-group")
    return kwd_group_tags
def publisher(soup):
    """Return first() of the "publisher-name" nodes that extract_nodes finds."""
    publisher_name_tags = extract_nodes(soup, "publisher-name")
    return first(publisher_name_tags)
def article_meta(soup):
    """Return first() of the "article-meta" nodes that extract_nodes finds."""
    article_meta_tags = extract_nodes(soup, "article-meta")
    return first(article_meta_tags)
def pub_date_collection(soup, pub_type):
    """Return first() of the "pub-date" nodes whose pub-type is pub_type."""
    matching_dates = extract_nodes(
        soup, "pub-date", attr="pub-type", value=pub_type)
    return first(matching_dates)
def journal_issn(soup, pub_format):
    """Return first() of the "issn" nodes whose publication-format is pub_format."""
    issn_tags = extract_nodes(
        soup, "issn", attr="publication-format", value=pub_format)
    return first(issn_tags)
def day(soup):
    """Return first() of the "day" nodes that extract_nodes finds in soup."""
    day_tags = extract_nodes(soup, "day")
    return first(day_tags)
def main():
    # Script entry point: load the vbucket map named by the first
    # command-line argument, draw two figures of histograms describing it,
    # run the failover simulation plots, then show every figure.
    vbmap = load_vbmap(sys.argv[1])
    masters = extract_masters(vbmap['map'])
    replicas = extract_replicas(vbmap['map'])
    nodes = extract_nodes(vbmap['map'])
    nodes_count = len(nodes)
    # Maps each node to its position in `nodes`.
    nodes_dict = dict((n, i) for i, n in enumerate(nodes))
    tags = extract_tags(vbmap, nodes)
    # Distinct tag values in sorted order; used as tick labels below.
    tags_list = sorted(set(tags.values()))
    tags_count = len(tags_list)

    # Figure 1, top: masters and each replica series, one bin per node.
    pylab.figure()
    pylab.subplot(211)
    # Tick labels sit at the centre of each unit-wide bin.
    pylab.xticks([i + 0.5 for i in xrange(nodes_count)], nodes)
    # presumably hist() turns node names into indices via nodes_dict -- TODO confirm
    plots = [hist(masters, nodes_dict)] + \
        [hist(r, nodes_dict) for r in replicas]
    labels = ['master'] + ['replica %d' % i for i in xrange(len(replicas))]
    pylab.hist(plots, bins=xrange(nodes_count + 1), label=labels)
    pylab.title("Number of vbuckets per node")
    pylab.xlabel("Nodes")
    pylab.ylabel("Number of vbuckets")
    pylab.legend()

    # Figure 1, bottom: every replica series flattened into one histogram.
    pylab.subplot(212)
    pylab.xticks([i + 0.5 for i in xrange(nodes_count)], nodes)
    all_replicas = list(chain(*replicas))
    pylab.hist(hist(all_replicas, nodes_dict), bins=xrange(nodes_count + 1),
               label='all replicas', rwidth=0.5)
    pylab.title("Number of replica vbuckets per node")
    pylab.xlabel("Nodes")
    pylab.ylabel("Number of vbuckets")
    pylab.legend()

    # Figure 2, top: the same series as figure 1, with each node replaced
    # by its tag.
    pylab.figure()
    pylab.subplot(211)
    plots = [[tags[n] for n in masters]] + \
        [[tags[n] for n in r] for r in replicas]
    pylab.hist(plots, bins=xrange(tags_count + 1), label=labels)
    pylab.xticks([i + 0.5 for i in xrange(tags_count)], tags_list)
    pylab.title("Number of vbuckets per tag")
    pylab.xlabel("Tags")
    pylab.ylabel("Number of vbuckets")
    pylab.legend()

    # Figure 2, bottom: one series per entry of tag_replication_counts().
    pylab.subplot(212)
    pylab.xticks([i + 0.5 for i in xrange(nodes_count)], nodes)
    pylab.title("Number of nodes each node replicates to per tag")
    pylab.xlabel("Nodes")
    pylab.ylabel("Number of replica nodes")
    tags_repcounts = tag_replication_counts(vbmap['map'], nodes, tags_list, tags)
    plots = []
    for tag_counts in tags_repcounts:
        plot = []
        # Repeat each node index `count` times so the histogram bar for
        # that node has height `count`.
        for node, count in enumerate(tag_counts):
            plot.extend([node] * count)
        plots.append(plot)
    # NOTE(review): the labels come from iterating `tags` (used above as a
    # node -> tag mapping), not `tags_list`; confirm they line up with the
    # series in `plots`.
    pylab.hist(plots, bins=xrange(nodes_count + 1), label=map(str, tags))
    pylab.legend()

    simulate(vbmap)

    # Display every figure created above.
    pylab.show()