def get_issues():
    """Parse the <issues> element of every case XML file and build a
    per-case list of integer issue ids.

    Initializes the metadata via ``util_sc`` under the module-level
    ``data_dir``, then walks every case file year by year.  Each distinct
    issue string (stripped, lowercased) is interned into the module-level
    ``issue_id`` / ``reverse_issue_id`` maps, with ids assigned in
    first-seen order.

    Returns:
        tuple: ``(issues_by_case, n_issues)`` where ``issues_by_case``
        maps a case identifier to the list of issue ids found in its
        file (one entry per comma-separated issue, duplicates kept), and
        ``n_issues`` is the number of distinct issue strings seen.

    Side effects:
        Mutates the module-level ``issue_id`` and ``reverse_issue_id``
        dictionaries in place.
    """
    n_issues = 0
    issues_by_case = defaultdict(list)
    meta_file = util_sc.init_metadata(data_dir)
    case_file_by_year = util_sc.get_case_from_metafile(meta_file, data_dir)
    for year in case_file_by_year:
        for case in case_file_by_year[year]:
            abs_case_file_path = data_dir + '/' + year + '/' + str(case) + '.xml'
            tree = et.ElementTree(file=abs_case_file_path)
            # Iterate the comma-separated issue strings directly rather
            # than indexing with xrange(len(...)).
            for raw_issue in tree.find("issues").text.split(','):
                issue = raw_issue.strip().lower()
                if issue not in issue_id:
                    # First occurrence: assign the next sequential id.
                    issue_id[issue] = n_issues
                    reverse_issue_id[n_issues] = issue
                    n_issues += 1
                issues_by_case[case].append(issue_id[issue])
    return issues_by_case, n_issues
def build_dataset(n_tokens, n_topics): meta_file = util_sc.init_metadata(data_dir) case_file_by_year = util_sc.get_case_from_metafile(meta_file, data_dir)