def get_space_by_year(data_path, start_year, end_year, threshold, level):
    """Build per-year reference counts and citer records from XML files.

    For every year ``y`` in ``[start_year, end_year]`` the tree under
    ``data_path`` is walked, each ``.xml`` file whose publication year equals
    ``y`` is turned into a ``citation.Citer`` and every reference found in its
    citing units is counted.  The counts are ranked, optionally truncated,
    and the citers' units are then restricted to the surviving references.

    Args:
        data_path: Root directory searched recursively for ``.xml`` files.
        start_year: First publication year to collect (inclusive).
        end_year: Last publication year to collect (inclusive).
        threshold: ``1`` keeps every reference.  A value greater than 1 is
            used as the number of top-ranked references to keep; any other
            value is multiplied by the number of distinct references and
            truncated to an int to obtain that number.
        level: Passed through unchanged to
            ``cd.xml_to_citation_list_consys``.

    Returns:
        ``[space, space_citer]``.  ``space[y]`` is an ``OrderedDict`` mapping
        reference -> citation count in descending count order;
        ``space_citer[y]`` maps document identifier -> citer.  Units left
        with fewer than two references, and citers left with no units, are
        removed.
    """
    space = {}
    space_citer = {}
    # File names already assigned to a year.  Keyed by bare file name (not
    # path), so a second file with the same name is never parsed again.
    processed = set()

    # NOTE(review): the tree is re-walked and unmatched files are re-parsed
    # once per year.  Kept as is because a single pass would change which of
    # two same-named files wins; revisit if that case cannot occur.
    for y in range(start_year, end_year + 1):
        counts = {}
        space_citer[y] = {}
        for dir_name, dir_names, file_names in os.walk(data_path):
            for filename in file_names:
                if filename in processed:
                    continue
                file_path = os.path.join(dir_name, filename)
                if os.path.splitext(file_path)[1] != '.xml':
                    continue
                xml = etree.parse(file_path)

                # Publication year: display date, then cover date, then the
                # copyright year attribute.
                display_dates = xml.xpath("//prism:coverDisplayDate", namespaces=ns)
                cover_dates = xml.xpath("//prism:coverDate", namespaces=ns)
                copyright_years = xml.xpath("//ce:copyright/@year", namespaces=ns)
                if len(display_dates) > 0 and display_dates[0].text is not None:
                    year = display_dates[0].text[-4:]
                elif len(cover_dates) > 0:
                    # The cover date is presumably an ISO date (YYYY-MM-DD);
                    # keep only the leading year so int() below succeeds.
                    # A bare four-digit year is unaffected.
                    year = cover_dates[0].text[:4]
                elif len(copyright_years) > 0:
                    year = copyright_years[0]
                else:
                    print("without data", file_path)
                    continue

                if y != int(year):
                    continue
                processed.add(filename)
                doc_id = xml.xpath("//dct:identifier", namespaces=ns)[0].text
                space_citer[y][doc_id] = citation.Citer(doc_id, 0, 0, 0, 0)
                citing_set_list = cd.xml_to_citation_list_consys(xml, level)
                space_citer[y][doc_id].units = citing_set_list
                for unit in citing_set_list:
                    for ref in unit:
                        counts[ref] = counts.get(ref, 0) + 1

        # Stable sort: references with equal counts keep first-seen order.
        ranked = sorted(counts.items(), key=lambda t: t[1], reverse=True)
        num_nodes = len(ranked)
        if threshold != 1:
            if threshold > 1:
                truncate = threshold
            else:
                truncate = int(threshold * num_nodes)
            ranked = list(itertools.islice(ranked, truncate))
            if ranked:
                # Walk back from the cutoff; at the first entry whose count
                # differs from the cutoff count, keep only the entries
                # before it.  Index 0 is never examined.
                # NOTE(review): the differing entry itself is dropped as
                # well -- possibly an off-by-one, left unchanged on purpose.
                cutoff_count = ranked[-1][1]
                for i in range(len(ranked) - 1, 0, -1):
                    if ranked[i][1] != cutoff_count:
                        ranked = ranked[:i]
                        break
        space[y] = OrderedDict(ranked)

        print("original citers:" + str(y) + "-" + str(len(space_citer[y])))
        emptied = []
        for doc_id, citer in space_citer[y].items():
            units = citer.units
            # Reverse order so deleting by index does not shift the
            # positions that are still to be visited.
            for idx in reversed(range(len(units))):
                units[idx] = set(ref for ref in units[idx] if ref in space[y])
                if len(units[idx]) < 2:
                    del units[idx]
            if len(units) == 0:
                emptied.append(doc_id)
        for doc_id in emptied:
            del space_citer[y][doc_id]
        print("Year " + str(y) + "- nodes:" + str(len(space[y])) + ", space: " + str(num_nodes) + ", citer:" + str(len(space_citer[y])))
    return [space, space_citer]
def get_space(data_path, start_year, end_year, threshold, level):
    """Build one reference-count table and citer records from all XML files.

    Every ``.xml`` file under ``data_path`` is parsed.  Documents with at
    least one usable abstract sentence and at least one citing unit become a
    ``citation.classes.Citer``, and each reference in their citing units is
    counted.  The counts are ranked, optionally truncated, and every citer's
    ``units`` are then restricted to the surviving references.

    Args:
        data_path: Root directory searched recursively for ``.xml`` files.
        start_year: Not used by this function; kept for interface
            compatibility.
        end_year: Not used by this function; kept for interface
            compatibility.
        threshold: ``1`` keeps every reference.  A value greater than 1 is
            used as the number of top-ranked references to keep; any other
            value is multiplied by the number of distinct references and
            truncated to an int to obtain that number.
        level: Passed through unchanged to
            ``cd.xml_to_citation_list_consys``.

    Returns:
        ``[space, space_citer]``.  ``space`` is an ``OrderedDict`` mapping
        reference -> citation count in descending count order;
        ``space_citer`` maps document identifier -> citer.  Units left with
        fewer than two references are removed, but citers whose ``units``
        end up empty are kept.
    """
    counts = {}
    space_citer = {}
    for dir_name, dir_names, file_names in os.walk(data_path):
        for filename in file_names:
            file_path = os.path.join(dir_name, filename)
            if os.path.splitext(file_path)[1] != '.xml':
                continue
            xml = etree.parse(file_path)

            # Publication year: display date, then cover date, then the
            # copyright year attribute.
            display_dates = xml.xpath("//prism:coverDisplayDate", namespaces=ns)
            cover_dates = xml.xpath("//prism:coverDate", namespaces=ns)
            copyright_years = xml.xpath("//ce:copyright/@year", namespaces=ns)
            if len(display_dates) > 0 and display_dates[0].text is not None:
                year = display_dates[0].text[-4:]
            elif len(cover_dates) > 0:
                # The cover date is presumably an ISO date (YYYY-MM-DD); keep
                # only the leading year so int(year) below succeeds.  A bare
                # four-digit year is unaffected.
                year = cover_dates[0].text[:4]
            elif len(copyright_years) > 0:
                year = copyright_years[0]
            else:
                print("without data", file_path)
                continue

            doc_id = xml.xpath("//dct:identifier", namespaces=ns)[0].text

            # Sentences citing the first bibliography entry whose main title
            # contains 'function'.
            contexts = []
            bib_ids = xml.xpath("//ce:bib-reference[contains(.//sb:maintitle,'function')]/@id", namespaces=ns)
            if len(bib_ids) > 0:
                bib_id = str(bib_ids[0])
                citing_sentences = xml.xpath("//ce:cross-refs[contains(@refid,'" + bib_id + "')]//ancestor::s ", namespaces=ns)
                for node in citing_sentences:
                    text = sentence_clean(node)
                    # Keep sentences with more than 40 non-space characters.
                    if (text is not None) and (len(re.sub(" ", "", text)) > 40):
                        contexts.append(citation.classes.Sentence(text))

            abstracts = []
            for node in xml.xpath("//ce:abstract-sec//s", namespaces=ns):
                text = sentence_clean(node)
                if (text is not None) and (len(re.sub(" ", "", text)) > 40):
                    abstracts.append(citation.classes.Sentence(text))
            # Skip documents without any usable abstract sentence.  The
            # previous test compared the list itself with 1, which never
            # skipped anything on Python 2 and raises on Python 3.
            if not abstracts:
                continue

            citer = citation.classes.Citer(doc_id, int(year), file_path, 0, 0, 0, 0)
            space_citer[doc_id] = citer
            doi_xml = xml.xpath("//ce:doi", namespaces=ns)
            if len(doi_xml) > 0:
                doi_text = str(doi_xml[0].text)
                citer.bib_id = doi_text
                # The second-to-last dot-separated DOI field is read as the
                # month.  NOTE(review): it is stored only when numeric; the
                # flattened source did not show whether a non-numeric value
                # was stored too -- confirm.
                doi_fields = doi_text.split(".")
                if len(doi_fields) >= 2 and doi_fields[-2].isdigit():
                    citer.month = int(doi_fields[-2])
            citer.path = file_path
            citer.abstract = abstracts
            citer.citation_context = contexts
            citing_set_list = cd.xml_to_citation_list_consys(xml, level)
            citer.orig_units = citing_set_list
            # units is filtered in place further down; orig_units stays intact.
            citer.units = copy.deepcopy(citing_set_list)
            if len(citing_set_list) == 0:
                del space_citer[doc_id]
                print("del " + doc_id)
                continue
            for unit in citing_set_list:
                for ref in unit:
                    counts[ref] = counts.get(ref, 0) + 1

    # Stable sort: references with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda t: t[1], reverse=True)
    num_nodes = len(ranked)
    if threshold != 1:
        if threshold > 1:
            truncate = threshold
        else:
            truncate = int(threshold * num_nodes)
        ranked = list(itertools.islice(ranked, truncate))
        if ranked:
            # Walk back from the cutoff; at the first entry whose count
            # differs from the cutoff count, keep only the entries before
            # it.  Index 0 is never examined.
            # NOTE(review): the differing entry itself is dropped as well --
            # possibly an off-by-one, left unchanged on purpose.
            cutoff_count = ranked[-1][1]
            for i in range(len(ranked) - 1, 0, -1):
                if ranked[i][1] != cutoff_count:
                    ranked = ranked[:i]
                    break
    space = OrderedDict(ranked)

    print("original citers:" + str(len(space_citer)))
    for citer in space_citer.values():
        units = citer.units
        # Reverse order so deleting by index does not shift the positions
        # that are still to be visited.
        for idx in reversed(range(len(units))):
            units[idx] = set(ref for ref in units[idx] if ref in space)
            if len(units[idx]) < 2:
                del units[idx]
    # Citers whose units are now empty are deliberately not removed here;
    # the removal step was already disabled in the previous version.
    print("All: " + "- nodes:" + str(len(space)) + ", space: " + str(num_nodes) + ", citer:" + str(len(space_citer)))
    return [space, space_citer]