Example #1
    def populateLocalCache(self):
        """
        Iterates over the document entries in self.datadict, retrieves documents
        from the net if not already cached, and extracts text from them if not already cached.
        """
        logger.info("Populating local document cache, retrieving from net as needed")
        for (k, d) in sorted(self.datadict.iteritems()):
            basename = utils.get_base_name(d["url"])

            fullpath = os.path.join(settings.DATADIR, basename)
            fulltxtpath = os.path.join(settings.DATADIR, basename.split(".")[0] + ".txt")
            if not os.path.exists(fulltxtpath) and not os.path.exists(fullpath):
                logger.info("Retrieving %s into %s" % (d["url"], settings.DATADIR))
                with open(fullpath, "wb") as f:
                    f.write(urllib.urlopen(d["url"]).read())

            if not os.path.exists(fulltxtpath):
                cmd = "pdftotext -f 1 -l 5 %s -" % fullpath
                logger.info("converting %s to text" % fullpath)

                p = subprocess.Popen(cmd.strip().split(" "), stdout=subprocess.PIPE)
                (contents, errf) = p.communicate()
                with codecs.open(fulltxtpath, "wb", encoding="utf-8") as f:
                    f.write(contents.decode("utf-8"))

                if settings.DELETE_PDF_AFTER_EXTRACTION:
                    os.unlink(fullpath)
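
All of these examples lean on a project-specific utils.get_base_name helper whose implementation is not shown on this page. For the URL-handling code above, a minimal sketch of what it presumably does (an assumption, not the project's actual code; the example URL is invented) would be:

# Hypothetical sketch of utils.get_base_name for URL inputs: keep only the final
# path segment, dropping scheme, host, query string and fragment.
import posixpath
try:
    from urllib.parse import urlparse   # Python 3
except ImportError:
    from urlparse import urlparse       # Python 2

def get_base_name(url):
    # e.g. "http://example.org/docs/protocol_123.pdf" -> "protocol_123.pdf"
    return posixpath.basename(urlparse(url).path)
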
Example #2
def findNotMatched(datadict, matches):
    # save data of orphan documents separately, for forensics.
    not_matched = {x['url'] for x in datadict.values()}.difference({x['url'] for x in matches})
    not_matched = [x for x in datadict.values() if x['url'] in not_matched]
    for (i, v) in enumerate(not_matched):
        not_matched[i].update({'docid': utils.get_base_name(v['url'])})
    return not_matched
Example #3
    def populateLocalCache(self):
        """
        Iterates over the document entries in self.datadict, retrieves documents
        from the net if not already cached, and extracts text from them if not already cached.
        """
        logger.info("Populating local document cache, retrieving from net as needed")
        for (k, d) in sorted(self.datadict.iteritems()):
            basename = utils.get_base_name(d['url'])

            fullpath = os.path.join(settings.DATADIR, basename)
            fulltxtpath = os.path.join(settings.DATADIR, basename.split('.')[0] + ".txt")
            if not os.path.exists(fulltxtpath) and not os.path.exists(fullpath):
                logger.info("Retrieving %s into %s" % (d['url'], settings.DATADIR))
                with open(fullpath, "wb") as f:
                    f.write(urllib.urlopen(d['url']).read())

            if not os.path.exists(fulltxtpath):
                cmd = "pdftotext -f 1 -l 5 %s -" % fullpath
                logger.info("converting %s to text" % fullpath)

                p = subprocess.Popen(cmd.strip().split(' '), stdout=subprocess.PIPE)
                (contents, errf) = p.communicate()
                with codecs.open(fulltxtpath, "wb", encoding='utf-8') as f:
                    f.write(contents.decode('utf-8'))

                if settings.DELETE_PDF_AFTER_EXTRACTION:
                    os.unlink(fullpath)
Example #4
def score(score_threshold, d):
    """ fuzzy match between all the records in identities """
    """ and all the lines present inside d['candidates'] """
    results = []
    for heading in d['candidates']:
        cand = [{'docid': utils.get_base_name(d['url']),
                 'url': d['url'],
                 'title': d['title'],
                 'date': d['date'],
                 'score': 0 if len(heading) < 6 else fuzz.partial_ratio(entityName, heading),
                 'entityName': entityName,
                 'id': id,
                 'heading': heading} for (entityName, id) in identities]

        results.append([x for x in cand if x['score'] > score_threshold])
    return results
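
The score field comes from fuzz.partial_ratio in the fuzzywuzzy package (now also published as thefuzz), which returns the best matching-substring ratio on a 0-100 scale. A standalone illustration with invented strings:

# Standalone illustration of the partial-ratio scoring used above; the strings are invented.
from fuzzywuzzy import fuzz

heading = "Protocol of the finance committee, chaired by Jane Doe"
entityName = "Jane Doe"

# partial_ratio aligns the shorter string against substrings of the longer one,
# so a name embedded in a long heading still scores high (100 here, since the
# name appears verbatim). Headings shorter than 6 characters are forced to 0 above.
print(fuzz.partial_ratio(entityName, heading))
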
Example #5
    def remove_duplicate_domains(self):
        """
		Filter out redundant domains, that is remove all but the first of the following domains:
		IPR001433	GO:0016491
		IPR001709	GO:0016491
		IPR001834	GO:0016491

		Parameters
		----------

		Returns
		-------
		pandas.DataFrame
			dataframe with domains with unique GO label
		"""
        print("Filtering out duplicate domains.")
        dom2go = read_csv(join(self.data_path, self.interpro2go_tab),
                          sep="\t",
                          header=0)
        num_dom = dom2go.shape[0]
        unique_domains_name = get_base_name(
            self.interpro2go_tab) + "_unique.tab"
        uniq_dom2go = dom2go.drop_duplicates(["GO_terms"])
        uniq_dom2go.to_csv(join(self.data_path, unique_domains_name),
                           sep="\t",
                           index=False)
        num_uniq_dom = uniq_dom2go.shape[0]
        print("Reducing domains from {} to {}.".format(num_dom, num_uniq_dom))
        return uniq_dom2go
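
The deduplication itself is plain pandas drop_duplicates on the GO_terms column. A tiny illustration built from the docstring's own example rows:

# Minimal demo of the drop_duplicates step, using the rows from the docstring above.
from pandas import DataFrame

dom2go = DataFrame({"interpro_ids": ["IPR001433", "IPR001709", "IPR001834"],
                    "GO_terms": ["GO:0016491", "GO:0016491", "GO:0016491"]})
uniq_dom2go = dom2go.drop_duplicates(["GO_terms"])
print(uniq_dom2go)
# only the first row (IPR001433, GO:0016491) survives; the other two share its GO term
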
Example #6
def create_random_set_multi(src_lang, trg_lang, work_dir, set_num):
    broad_samples_dir = os.path.join(work_dir, "broad-samples")
    #sample_sizes=utils.get_immediate_subdirectories(broad_samples_dir)
    sample_dirs = utils.sub_dir_path(broad_samples_dir)

    random_dir = os.path.join(work_dir, "random-sets")

    for sample_dir in sample_dirs:

        sample_size = utils.get_base_name(sample_dir)
        sample_out_dir = os.path.join(random_dir, sample_size)

        samples_name = utils.get_immediate_subfiles(sample_dir)
        name_list = set()

        for sample in samples_name:
            name_list.add(sample.split('.')[0])
        for name in name_list:

            src_in_file = os.path.join(work_dir, "broad-samples", sample_size,
                                       ''.join([name, '.', src_lang]))
            trg_in_file = os.path.join(work_dir, "broad-samples", sample_size,
                                       ''.join([name, '.', trg_lang]))

            create_random_set_single(src_lang, trg_lang, src_in_file,
                                     trg_in_file, set_num, sample_out_dir)
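
utils.sub_dir_path, utils.get_immediate_subfiles and utils.get_base_name are project helpers that are not shown here. A plausible sketch, assuming they are thin wrappers around os.listdir and os.path:

# Plausible sketches of the directory helpers used above (assumptions, not the project's code).
import os

def sub_dir_path(d):
    # full paths of the immediate subdirectories of d
    return [os.path.join(d, name) for name in os.listdir(d)
            if os.path.isdir(os.path.join(d, name))]

def get_immediate_subfiles(d):
    # names (not paths) of the regular files directly inside d
    return [name for name in os.listdir(d)
            if os.path.isfile(os.path.join(d, name))]

def get_base_name(path):
    # last path component, e.g. ".../broad-samples/1000" -> "1000"
    return os.path.basename(os.path.normpath(path))
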
Example #7
    def save_rand_comb(self, num_comb, uniq_dom2go):
        """
		Pick num_comb random combinations from the domains column of uniq_dom2go dataframe

		Parameters
		----------
		num_comb : int
			number of combinations to pick
		uniq_dom2go : pandas.DataFrame
			dataframe of domains with unique GO terms

		Returns
		-------
		None
		"""
        num_uniq_dom = uniq_dom2go.shape[0]
        print("Pick {} random combinations of the {} domains and save them.".
              format(num_comb, num_uniq_dom))
        rand_combos = choose_combos(num_uniq_dom, 2, num_comb)
        # save dataframe with the combinations
        rand_comb_name = get_base_name(self.interpro2go_tab) + "_rand_comb.csv"
        with open(join(self.data_path, rand_comb_name), 'w') as rand_comb_file:
            combo_domains_header = [
                "interpro_id1", "interpro_id2", "gos_id1", "gos_id2"
            ]
            writer = csv.writer(rand_comb_file, delimiter=',')
            writer.writerow(combo_domains_header)
            for rand_combo in rand_combos:
                dom_combo = [
                    str(uniq_dom2go.iloc[rand_combo[0]].interpro_ids),
                    str(uniq_dom2go.iloc[rand_combo[1]].interpro_ids),
                    str(uniq_dom2go.iloc[rand_combo[0]].GO_terms),
                    str(uniq_dom2go.iloc[rand_combo[1]].GO_terms)
                ]
                writer.writerow(dom_combo)
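
choose_combos is not shown in this snippet. A minimal sketch, assuming it draws num_comb distinct index combinations of size k from range(n) so the results can be used as iloc positions as above:

# Hypothetical sketch of choose_combos (assumption): sample distinct size-k index
# combinations without enumerating all of them; assumes num_comb is far below the
# total number of combinations, otherwise the loop would not terminate.
import random

def choose_combos(n, k, num_comb, seed=None):
    rng = random.Random(seed)
    combos = set()
    while len(combos) < num_comb:
        # sorted tuple so (3, 7) and (7, 3) count as the same combination
        combos.add(tuple(sorted(rng.sample(range(n), k))))
    return list(combos)

# e.g. choose_combos(5000, 2, 10) -> ten random (i, j) index pairs with i < j
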
Example #8
    def get_scene_file_name(self):
        """Get full file name without extensions.

        This name will be used to create the folder name where we save the
        outfile.
        """
        file_name = u.get_base_name(self.file_path).split('.')[0]

        return file_name
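
Splitting the base name on the first '.' also discards anything after an interior dot, which matters for versioned file names. If only the final extension should be removed, os.path.splitext is the usual alternative; the file name below is invented:

# Comparing the two ways of dropping an extension.
import os

name = "shot_010.v2.mb"
print(name.split('.')[0])         # "shot_010"     -> also loses ".v2"
print(os.path.splitext(name)[0])  # "shot_010.v2"  -> strips only the last extension
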
Example #9
def bleu_intervals(in_dir, level, single_bleu_dir):
    st_builder = []

    for sample_dir in utils.sub_dir_path(in_dir):
        st_builder.append(sample_dir)

        # collect all per-sample BLEU scores found under this directory
        bleu_arr = []
        for file in utils.get_immediate_subfiles(sample_dir):
            if file.endswith(".bleu"):
                path = os.path.join(sample_dir, file)
                score = read_decoding_score(path)
                bleu_arr.append(float(score))

        scores = np.array(bleu_arr)
        interval = confidence_intervals(scores, level)
        st_builder.append("input scores")
        st_builder.append("min=%f max=%f" % (min(scores), max(scores)))
        st_builder.append("level: %d" % (level))
        st_builder.append("bleu intervals:")
        st_builder.append("interval: %f %f" % (min(interval), max(interval)))

        # single bleu score
        single_bleu_file_name = ''.join(['sample-', utils.get_base_name(sample_dir), '.bleu'])
        single_bleu_path = os.path.join(single_bleu_dir, single_bleu_file_name)
        true_bleu = read_decoding_score(single_bleu_path)
        st_builder.append("true bleu: %s\n" % (true_bleu))

    return st_builder
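
confidence_intervals and read_decoding_score are project functions that are not shown here. A common way to get an interval for a set of per-sample BLEU scores is a percentile bootstrap; a sketch under that assumption, also assuming level is given in percent (e.g. 95):

# Hypothetical sketch of confidence_intervals (assumption): percentile bootstrap
# over the per-sample BLEU scores, returning the lower and upper bounds.
import numpy as np

def confidence_intervals(scores, level, n_resamples=1000, seed=0):
    rng = np.random.RandomState(seed)
    means = []
    for _ in range(n_resamples):
        resample = rng.choice(scores, size=len(scores), replace=True)
        means.append(resample.mean())
    alpha = (100 - level) / 2.0
    # e.g. level=95 -> the 2.5th and 97.5th percentiles of the bootstrap means
    return np.percentile(means, [alpha, 100 - alpha])
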
Example #10
def findNotMatched(datadict, matches):
    # save data of orphan documents separately, for forensics.
    not_matched = {x['url'] for x in datadict.values()}.difference(
        {x['url'] for x in matches})
    not_matched = [x for x in datadict.values() if x['url'] in not_matched]
    for (i, v) in enumerate(not_matched):
        not_matched[i].update({'docid': utils.get_base_name(v['url'])})
    return not_matched
Example #11
    def getDocumentLines(self, k):
        d = self.datadict[k]
        basename = utils.get_base_name(d["url"])
        fullpath = os.path.join(settings.DATADIR, basename)
        fulltxtpath = os.path.join(settings.DATADIR, basename.split(".")[0] + ".txt")
        logger.debug("Loading cached text for %s from %s" % (fullpath, fulltxtpath))
        with codecs.open(fulltxtpath, encoding="utf-8") as f:
            contents = f.read().encode("utf-8")

        lines = self.sanitize_lines(contents.split("\n"))
        return lines
Example #12
    def getDocumentLines(self, k):
        d = self.datadict[k]
        basename = utils.get_base_name(d['url'])
        fullpath = os.path.join(settings.DATADIR, basename)
        fulltxtpath = os.path.join(settings.DATADIR, basename.split('.')[0] + ".txt")
        logger.debug("Loading cached text for %s from %s" % (fullpath, fulltxtpath))
        with codecs.open(fulltxtpath, encoding='utf-8') as f:
            contents = f.read().encode('utf-8')

        lines = self.sanitize_lines(contents.split("\n"))
        return lines
Example #13
def score(score_threshold, d):
    """ fuzzy match between all the records in identities """
    """ and all the lines present inside d['candidates'] """
    results = []
    for heading in d['candidates']:
        cand = [{'docid': utils.get_base_name(d['url']),
                 'url': d['url'],
                 'title': d['title'],
                 'date': d['date'],
                 'score': 0 if len(heading) < 6 else fuzz.partial_ratio(entityName, heading),
                 'entityName': entityName,
                 'id': id,
                 'heading': heading} for (entityName, id) in identities]

        results.append([x for x in cand if x['score'] > score_threshold])
    return results
Example #14
    def get_go_labels(self):
        """
		Get GO labels for the whole domains2GO dataframe

		Parameters
		----------

		Returns
		-------
		None
		"""
        print("Get labels for GOs.")
        dom2go = read_csv(join(self.data_path, self.interpro2go_tab),
                          sep="\t",
                          header=0)
        dom2go_labels = dom2go.apply(self.extract_go_labels, axis=1)
        domains_with_labels = get_base_name(
            self.interpro2go_tab) + "_labels.csv"
        dom2go_labels.to_csv(join(self.data_path, domains_with_labels),
                             sep=",",
                             index=False)
Example #15
    def update_clients_from_server(self,
                                   sess,
                                   clients,
                                   update_vars_type=utils.VARS_TYPE_ALL):
        """Updates clients vars from server vars.

    Args:
      sess: TF Session.
      clients: A list of clients that will be updated from server.
      update_vars_type: String. Options: utils.VARS_TYPE_ALL means all vars,
        utils.VARS_TYPE_SHARED means shared vars.

    Raises:
      ValueError: Unknown update_vars_type.
    """
        if update_vars_type == utils.VARS_TYPE_ALL:
            server_vars = sess.run(self.server.read_ops_all_vars)
            client_update_ops = [c.update_ops_all for c in clients]

            client_update_ops_feed_dict = {}
            for c in clients:
                for var_base_name, placeholder in c.dict_update_placeholders.items():
                    client_update_ops_feed_dict[placeholder] = np.array(
                        [server_vars[var_base_name]])

        elif update_vars_type == utils.VARS_TYPE_SHARED:
            server_shared_vars = sess.run(self.server.read_ops_shared_vars)
            client_update_ops = [c.update_ops_shared for c in clients]
            client_update_ops_feed_dict = {}
            for c in clients:
                for shared_var in c.model_train.shared_vars:
                    var_base_name = utils.get_base_name(shared_var)
                    placeholder = c.dict_update_placeholders[var_base_name]
                    client_update_ops_feed_dict[placeholder] = np.array(
                        [server_shared_vars[var_base_name]])
        else:
            raise ValueError('Unknown vars update type: %s' % update_vars_type)

        sess.run(client_update_ops, feed_dict=client_update_ops_feed_dict)
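
In this snippet utils.get_base_name is applied to TensorFlow variables rather than file paths or URLs. A plausible sketch, assuming it strips the agent-specific name scope and the ':0' output suffix so that server and client copies of the same weight share one dictionary key:

# Plausible sketch of utils.get_base_name for TF variables (assumption, not the
# project's actual code); works on anything with a TF-style .name attribute.
def get_base_name(var):
    name = var.name.split(':')[0]    # "client_3/lstm/kernel:0" -> "client_3/lstm/kernel"
    return name.split('/', 1)[1] if '/' in name else name   # -> "lstm/kernel"

class FakeVar(object):
    name = "client_3/lstm/kernel:0"

print(get_base_name(FakeVar()))      # "lstm/kernel"
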
Example #16
    def convert_to_tab(self, keep_only_MF):
        """
		Convert the mapping of InterPro domains to GOs into a tabular file.
		For each InterPro domain in the species file, read all of its GOs and arrange them as the columns of its row.

		Parameters
		----------
		keep_only_MF : bool
			if True, keep only molecular-function GO annotations; otherwise keep all

		Returns
		-------
		None
		"""
        print("Converting to tabs.")
        self.read_species_domains()
        # build the suffix first so the base name and species prefix are kept in both cases
        suffix = "_MF.tab" if keep_only_MF else ".tab"
        interpro2go_tab = get_base_name(self.interpro2go) + "_" + self.species_name + suffix
        self.interpro2go_tab = interpro2go_tab
        num_written_lines = 0
        with open(self.interpro2go, 'r') as interpro2go_file, open(
                join(self.data_path, interpro2go_tab),
                'w') as interpro2go_tab_file:
            interpro2go_tab_file.write("interpro_ids\tGO_terms\n")
            previous_id = " "
            previous_go_terms = []
            for interpro2go_line in interpro2go_file:
                if interpro2go_line[0] != "!":
                    current_id = interpro2go_line.strip().split(
                        "InterPro:")[1].split(" ")[0]
                    assert current_id[:3] == "IPR", \
                        "AssertionError: interpro id must start with IPR.\n line: {}".format(interpro2go_line)
                    current_go_term = interpro2go_line.strip().split(" ; ")[-1]
                    if keep_only_MF and (current_go_term in self.go_db and
                                         self.go_db[current_go_term].namespace
                                         != "molecular_function"):
                        continue
                    if previous_id == " ":  # init
                        previous_go_terms.append(current_go_term)
                        previous_id = current_id
                    else:
                        if current_id == previous_id:  # still in the same interpro domain
                            previous_go_terms.append(current_go_term)
                        else:  # on another interpro domain
                            assert previous_id != " ", \
                                "AssertionError: id must not be null.\n line: {}".format(interpro2go_line)
                            assert len(previous_go_terms) > 0, \
                                "AssertionError: each interpro should have at least one GO.\n line:{}".format(interpro2go_line)
                            if previous_id in self.species_domains_dict:
                                interpro2go_tab_file.write(
                                    previous_id + '\t' +
                                    " ".join(previous_go_terms) + "\n")
                                num_written_lines = num_written_lines + 1
                            previous_id = current_id
                            previous_go_terms = [current_go_term]

            # flush the last accumulated domain; the loop above only writes it on the next id change
            if previous_id != " " and previous_id in self.species_domains_dict:
                interpro2go_tab_file.write(previous_id + '\t' +
                                           " ".join(previous_go_terms) + "\n")
                num_written_lines = num_written_lines + 1

        print("Saved {} interpro2GO tabs in {}.".format(
            num_written_lines, interpro2go_tab))
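
For reference, this is how one source line is taken apart by the parsing above, using an invented line shaped the way the code expects interpro2go entries to look (the IPR/GO pair is borrowed from the docstring in Example #5):

# Worked example of the line parsing in convert_to_tab; the line content is invented.
interpro2go_line = "InterPro:IPR001433 Oxidoreductase, FAD-binding > GO:oxidoreductase activity ; GO:0016491"
current_id = interpro2go_line.strip().split("InterPro:")[1].split(" ")[0]
current_go_term = interpro2go_line.strip().split(" ; ")[-1]
print(current_id)       # "IPR001433"
print(current_go_term)  # "GO:0016491"
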
Example #17
    def update_server_from_clients(self,
                                   sess,
                                   clients,
                                   update_vars_type=utils.VARS_TYPE_ALL):
        """Updates server vars to be the weighted average of client vars.

    Args:
      sess: TF Session.
      clients: A list of clients that will be used to update server.
      update_vars_type: String. Options: utils.VARS_TYPE_ALL means all vars,
        utils.VARS_TYPE_SHARED means shared vars.

    Raises:
      ValueError: Unknown update_vars_type.
    """
        num_clients = len(clients)
        total_num_batches = 0
        for c in clients:
            total_num_batches += c.model_train.data.num_batches
        # client_weights should sum to num_clients.
        client_weights = [
            float(c.model_train.data.num_batches * num_clients /
                  total_num_batches) for c in clients
        ]

        if update_vars_type == utils.VARS_TYPE_ALL:
            read_client_ops = collections.defaultdict(list)
            for var_base_name in self.server.model_train.var_dict:
                for c in clients:
                    read_client_ops[var_base_name].append(
                        c.read_ops_all_vars[var_base_name])

            client_vars = sess.run(read_client_ops)

            for cid, c in enumerate(clients):
                weight = client_weights[cid]
                for var_base_name in self.server.model_train.var_dict:
                    client_vars[var_base_name][cid] *= weight

            server_feed_dict = {}
            for (var_base_name,
                 placeholder) in self.server.dict_update_placeholders.items():
                client_vars_as_array = np.array(client_vars[var_base_name])
                server_feed_dict[placeholder] = client_vars_as_array

            sess.run(self.server.update_ops_all, feed_dict=server_feed_dict)

        elif update_vars_type == utils.VARS_TYPE_SHARED:
            read_client_ops = collections.defaultdict(list)
            for v in self.server.model_train.shared_vars:
                var_base_name = utils.get_base_name(v)
                for c in clients:
                    read_client_ops[var_base_name].append(
                        c.read_ops_shared_vars[var_base_name])
            client_vars = sess.run(read_client_ops)

            for cid, c in enumerate(clients):
                weight = client_weights[cid]
                for shared_var in self.server.model_train.shared_vars:
                    var_base_name = utils.get_base_name(shared_var)
                    client_vars[var_base_name][cid] *= weight

            server_feed_dict = {}
            for shared_var in self.server.model_train.shared_vars:
                var_base_name = utils.get_base_name(shared_var)
                client_vars_as_array = np.array(client_vars[var_base_name])
                placeholder = self.server.dict_update_placeholders[
                    var_base_name]
                server_feed_dict[placeholder] = client_vars_as_array

            sess.run(self.server.update_ops_shared, feed_dict=server_feed_dict)

        else:
            raise ValueError('Unknown vars update type: %s' % update_vars_type)
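
The client weights above are proportional to each client's batch count and rescaled so that they sum to the number of clients. A worked numeric example with invented batch counts:

# Worked example of the client weighting; the batch counts are invented.
num_batches = [10, 30, 60]
num_clients = len(num_batches)
total_num_batches = float(sum(num_batches))
client_weights = [n * num_clients / total_num_batches for n in num_batches]
print(client_weights)       # [0.3, 0.9, 1.8]
print(sum(client_weights))  # ~3.0, i.e. num_clients up to float rounding, so the
                            # server ends up with a batch-count-weighted average
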
Example #18
def main():
    data = None

    ##################################
    # get stuff
    ##################################
    retriever = MetaDataRetriever(linksOutputFile=settings.LINKSFILE)
    data = retriever.scrape().save().getData()
    # dedupe, date filter and get back the data
    datadict = LinksProcessor(data, filterDate=settings.START_DATE).getData()

    # retrieve docs from net, convert them to text and cache the result
    docs = DocumentCache(datadict)

    datedict = {}

    ##################################
    # find stuff
    ##################################
    logger.info("Finding candidate lines...")

    for k in sorted(datadict.keys()):
        lines = docs.getDocumentLines(k)

        pat = [x for x in utils.mergeLines(lines, 3) if re.search(MAGIC_RE, x)]
        datepat = [
            x for x in utils.mergeLines(lines, 3) if re.search(DATE_RE, x)
        ]

        datadict[k]['candidates'] = pat
        datedict[k] = {'candidates': datepat}

    matchdict = makeMatches(datadict)

    # As a kludge, committees are stored with id COMMITEE_ID_BASE+offset in identities.json
    # so we can separate the matches into types
    mksMatchesCnt = len([
        x for x in matchdict.values()
        if int(x[0]['id']) < settings.COMMITEE_ID_BASE
    ])
    commMatchesCnt = len(matchdict) - mksMatchesCnt
    logger.info("Located %d unique matches with score > %d (%d: mks, %d: committee) " %\
                (len(matchdict), settings.SCORE_THRESHOLD, mksMatchesCnt, commMatchesCnt))

    ##################################
    # save stuff
    ##################################
    matches = reduce(lambda x, y: x + y, matchdict.values())
    not_matched = findNotMatched(datadict, matches)

    dump_json(not_matched, settings.NOMATCHESFILE)
    logger.info("saved details of documents with no matches as json in %s",
                settings.NOMATCHESFILE)

    dump_report(not_matched, settings.NO_MATCHES_HTML_FILE,
                settings.NO_MATCHES_TEMPLATE_FILE)
    dump_report(matches, settings.MATCHES_HTML_FILE,
                settings.MATCHES_TEMPLATE_FILE)

    cnt = 0
    logger.info("finding committee session dates")
    matchesDict = {x['docid']: x for x in matches}
    for (k, v) in datedict.iteritems():
        for line in v['candidates']:
            line = utils.reverse_nums(
                line)  # text extraction reverses numbers, RTL thing
            # munge and contort to extract a valid date
            d = extract_date(utils.get_base_name(k), line)
            if d and matchesDict.get(d['docid']):
                cnt += 1
                matchesDict[d['docid']]['comm_session_date'] = d[
                    'date'].strftime("%d/%m/%Y")

    logger.info("updated %d documents with a committee session date" % cnt)
    # use the updated dict containing comm_session_date
    # for matches
    matches = matchesDict.values()

    logger.info("saved matches as json in %s", settings.MATCHESFILE)
    dump_json(matches, settings.MATCHESFILE)

    logger.info("saved matches as csv in %s", settings.MATCHES_CSV_FILE)
    # saves matches as csv file
    g = filter_keys(data_to_gen(settings.MATCHESFILE))
    write_tsv(g, settings.MATCHES_CSV_FILE)

    # <-> short-circuit here to skip previous stages

    # load the matches back up
    with codecs.open(settings.MATCHESFILE, "r", encoding='utf-8') as f:
        matches = json.load(f)

    createRankings(matches)

    logger.info("saved rankings in %s", settings.COUNTS_CSVFILE)
    logger.info("Cheers.")
Example #19
    def __init__(self,
                 name,
                 data_generator,
                 model_class,
                 configs=None,
                 id_=-1,
                 initializer=None):
        self.name = name
        self.id = id_
        self.data = data_generator(configs=configs, agent_id=id_)

        with tf.name_scope(utils.get_train_name_scope(name)):
            train_data = self.data.train_data_batch
            model_train = model_class(name,
                                      is_training=True,
                                      data=train_data,
                                      config=configs.train_config,
                                      initializer=initializer)

        with tf.name_scope(utils.get_validation_name_scope(name)):
            valid_data = self.data.validation_data_batch
            model_validation = model_class(name,
                                           is_training=False,
                                           data=valid_data,
                                           reuse=True,
                                           config=configs.train_config,
                                           initializer=initializer)

        with tf.name_scope(utils.get_test_name_scope(name)):
            test_data = self.data.test_data_batch
            model_test = model_class(name,
                                     is_training=False,
                                     data=test_data,
                                     reuse=True,
                                     config=configs.eval_config,
                                     initializer=initializer)

        self.model_train = model_train
        self.model_validation = model_validation
        self.model_test = model_test

        with tf.name_scope(utils.get_update_name_scope(self.name)):
            # One could use any of the three models in this update name scope, since
            # the vars are shared among them.
            update_ops_shared, placeholders_shared = utils.generate_update_ops(
                self.model_train.shared_vars)
            update_ops_personal, placeholders_personal = utils.generate_update_ops(
                self.model_train.personal_vars)
            update_ops_all = update_ops_shared + update_ops_personal
            # Merges two dicts of placeholders. placeholders_shared and
            # placeholders_personal should have no overlap keys.
            assert not set(placeholders_shared.keys()).intersection(
                placeholders_personal.keys())
            dict_update_placeholders = {}
            dict_update_placeholders.update(placeholders_shared)
            dict_update_placeholders.update(placeholders_personal)

        self.update_ops_all = update_ops_all
        self.update_ops_shared = update_ops_shared
        self.dict_update_placeholders = dict_update_placeholders

        self.read_ops_all_vars = {
            k: v.value()
            for k, v in self.model_train.var_dict.items()
        }
        self.read_ops_shared_vars = {
            utils.get_base_name(v): v.value()
            for v in self.model_train.shared_vars
        }
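
utils.generate_update_ops is not shown here. A hypothetical sketch, assuming it creates one placeholder-fed assign op per variable and keys the placeholders by base name (TF1-style graph code; see the get_base_name sketch after Example #15):

# Hypothetical sketch of utils.generate_update_ops (assumption, not the project's code).
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()  # needed when running TF1-style graph code under TF2

def generate_update_ops(variables):
    update_ops = []
    placeholders = {}
    for v in variables:
        base_name = get_base_name(v)
        # the leading dimension of 1 matches the np.array([value]) wrapping used when
        # the client feed dict is built in update_clients_from_server (Example #15)
        ph = tf.placeholder(v.dtype.base_dtype,
                            shape=[1] + v.shape.as_list(),
                            name=base_name.replace('/', '_') + '_update')
        update_ops.append(tf.assign(v, ph[0]))
        placeholders[base_name] = ph
    return update_ops, placeholders

The server-side update ops presumably reduce over the leading client dimension instead of taking element 0; that detail is not recoverable from these snippets.
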
Example #20
def main():
    data = None

    ##################################
    # get stuff
    ##################################
    retriever = MetaDataRetriever(linksOutputFile=settings.LINKSFILE)
    data=retriever.scrape().save().getData()
    # dedupe, date filter and get back the data
    datadict = LinksProcessor(data, filterDate=settings.START_DATE).getData()

    # retrieve docs from net, convert them to text and cache the result
    docs = DocumentCache(datadict)

    datedict = {}

    ##################################
    # find stuff
    ##################################
    logger.info("Finding candidate lines...")

    for k in sorted(datadict.keys()):
        lines = docs.getDocumentLines(k)

        pat = [x for x in utils.mergeLines(lines,3) if re.search(MAGIC_RE, x)]
        datepat = [x for x in utils.mergeLines(lines,3) if re.search(DATE_RE, x)]

        datadict[k]['candidates'] = pat
        datedict[k] = {'candidates': datepat}

    matchdict=makeMatches(datadict)

    # As a kludge, committees are stored with id COMMITEE_ID_BASE+offset in identities.json
    # so we can separate the matches into types
    mksMatchesCnt = len([x for x in matchdict.values() if int(x[0]['id']) < settings.COMMITEE_ID_BASE])
    commMatchesCnt = len(matchdict) - mksMatchesCnt
    logger.info("Located %d unique matches with score > %d (%d: mks, %d: committee) " %\
                (len(matchdict), settings.SCORE_THRESHOLD, mksMatchesCnt, commMatchesCnt))

    ##################################
    # save stuff
    ##################################
    matches = reduce(lambda x, y: x + y, matchdict.values())
    not_matched=findNotMatched(datadict,matches)

    dump_json(not_matched,settings.NOMATCHESFILE)
    logger.info("saved details of documents with no matches as json in %s", settings.NOMATCHESFILE)

    dump_report(not_matched,settings.NO_MATCHES_HTML_FILE,settings.NO_MATCHES_TEMPLATE_FILE)
    dump_report(matches,settings.MATCHES_HTML_FILE,settings.MATCHES_TEMPLATE_FILE)

    cnt=0
    logger.info("finding committee session dates")
    matchesDict = {x['docid']: x for x in matches}
    for (k, v) in datedict.iteritems():
        for line in v['candidates']:
            line = utils.reverse_nums(line) # text extraction reverses numbers, RTL thing
            # munge and contort to extract a valid date
            d = extract_date(utils.get_base_name(k), line)
            if d and matchesDict.get(d['docid']):
                cnt +=1
                matchesDict[d['docid']]['comm_session_date'] = d['date'].strftime("%d/%m/%Y")

    logger.info("updated %d documents with a committee session date" % cnt)
    # use the updated dict containing comm_session_date
    # for matches
    matches = matchesDict.values()

    logger.info("saved matches as json in %s", settings.MATCHESFILE)
    dump_json(matches,settings.MATCHESFILE)

    logger.info("saved matches as csv in %s", settings.MATCHES_CSV_FILE)
    # saves matches as csv file
    g=filter_keys(data_to_gen(settings.MATCHESFILE))
    write_tsv(g,settings.MATCHES_CSV_FILE)

    # <-> short-circuit here to skip previous stages

    # load the matches back up
    with codecs.open(settings.MATCHESFILE, "r", encoding='utf-8') as f:
        matches = json.load(f)

    createRankings(matches)

    logger.info("saved rankings in %s", settings.COUNTS_CSVFILE)
    logger.info("Cheers.")