import math
import os
from collections import Counter
from operator import itemgetter

# Project-specific helpers used below (read_file, parse_stuff, load_config,
# abspath, get_model_paths, file_to_dict, get_stoplist) are assumed to be
# provided by the repository's utility modules.


def create(self):
    """Build a positional inverted index over the parsed corpus.

    Each posting has the form [doc_id, term_frequency, pos_1, pos_2, ...].
    """
    docs = os.listdir(self.doc_dir)
    index = {}
    for doc in docs:
        file_name = os.path.join(self.doc_dir, doc)
        if file_name.endswith('.txt'):
            doc_name = doc.replace('CACM-', '').replace('.txt', '')
            doc_text = read_file(file_name).strip().split()
            terms = set(doc_text)
            for term in terms:
                # Posting: document id, term frequency, then term positions.
                entry = [doc_name, doc_text.count(term)]
                entry += [i for i, x in enumerate(doc_text) if x == term]
                inv_list = index.get(term, [])
                # Re-sort the existing postings by term frequency before
                # appending the new one.
                inv_list = sorted(inv_list, key=itemgetter(1))
                inv_list.append(entry)
                index[term] = inv_list
    self.index = index
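
# Illustrative sketch (not part of the method above): the posting shape that
# create() produces, shown on a tiny hand-made corpus. The names here
# ("toy_docs", "_example_postings") are hypothetical.
def _example_postings():
    toy_docs = {'1': 'a b a c'.split(), '2': 'b b d'.split()}
    index = {}
    for doc_id, tokens in toy_docs.items():
        for term in set(tokens):
            entry = [doc_id, tokens.count(term)]
            entry += [i for i, tok in enumerate(tokens) if tok == term]
            index.setdefault(term, []).append(entry)
    # index['b'] == [['1', 1, 1], ['2', 2, 0, 1]]: doc id, tf, positions.
    return index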
def scores(self, query):
    """Score documents for a query with tf-idf.

    For each query term: score = (tf / doc_len) * ln(1 / df),
    where df = |posting list| / corpus size.
    """
    scores = {}
    docs = os.listdir(self.doc_dir)
    corpus_len = len(docs)
    for term in query.split():
        if term in self.index.keys():
            # In mode 1, stopwords contribute nothing.
            if self.mode == 1 and term in self.stoplist:
                continue
            inv_list = self.index[term]
            df = len(inv_list) / corpus_len
            for entry in inv_list:
                doc_id = entry[0]
                doc_name = 'CACM-' + doc_id + '.txt'
                doc_text = read_file(os.path.join(self.doc_dir, doc_name))
                # Document length in tokens.
                doc_len = len(doc_text.split())
                tf = entry[1] / doc_len
                score = tf * math.log(1 / df)
                scores[doc_id] = scores.get(doc_id, 0) + score
    return scores
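
# Illustrative sketch of the scoring formula above, with made-up numbers:
# a term occurring 3 times in a 100-token document, whose posting list
# covers 5 of 1000 documents. "_example_tfidf_score" is hypothetical.
def _example_tfidf_score(term_count=3, doc_len=100, df_docs=5, corpus_len=1000):
    tf = term_count / doc_len      # 0.03
    df = df_docs / corpus_len      # 0.005
    return tf * math.log(1 / df)   # 0.03 * ln(200) ~= 0.159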
def __init__(self, mode):
    paths = get_model_paths(mode)
    self.mode = mode
    self.doc_dir = paths['doc_dir']
    self.index = file_to_dict(paths['index_file'])
    # Per-document lengths in tokens, keyed by bare document id.
    self.dlens = {
        doc.replace('CACM-', '').replace('.txt', ''):
            len(read_file(os.path.join(self.doc_dir, doc)).split())
        for doc in os.listdir(self.doc_dir)
    }
    # Total number of tokens in the corpus.
    self.clen = sum(self.dlens.values())
    self.stoplist = get_stoplist()
def __init__(self, model):
    config = load_config()
    data_dir = config.get('DIRS', 'data_dir')
    stopwords_file = abspath(data_dir, config.get('FILES', 'common_words'))
    corpus_dir = config.get('DIRS', 'corpus_dir')
    self.stopwords = read_file(stopwords_file).split('\n')
    self.parsed_dir = abspath(corpus_dir, config.get('DIRS', 'parsed_dir'))
    self.model = model
def get_freq_terms(self, doc_id):
    """Return the three most frequent non-stopword (term, count) pairs."""
    doc_path = os.path.join(self.parsed_dir, 'CACM-' + doc_id + '.txt')
    terms = read_file(doc_path).split()
    most_common = Counter(terms).most_common()
    freq_terms = [
        word_tuple for word_tuple in most_common
        if not self.is_stop_word(word_tuple[0])
    ]
    return freq_terms[:3]
def get_run(self):
    """Parse the results file into a dict mapping query id to the list of
    retrieved document ids."""
    run = {}
    run_text = read_file(self.results_file_path)
    run_text = run_text.replace('\n\n', '\n')
    for line in run_text.split('\n')[:-1]:
        data = line.split()
        query_id = data[0]
        doc_id = data[2]
        run.setdefault(query_id, []).append(doc_id)
    return run
def get_queries(self):
    """Extract query ids and query texts from the query file."""
    queries = {}
    self.data_parser.initialize()
    self.data_parser.feed(read_file(self.query_file))
    qdata = self.data_parser.get_data()
    # The parser output repeats every 8 items: the query id sits at
    # offset 3 and the query text two items after it.
    i = 3
    while i < len(qdata):
        queries[int(qdata[i].strip())] = parse_stuff(qdata[i + 2])
        i += 8
    return queries
def get_freq_terms(self, doc):
    """Return the document's significant words: terms whose frequency meets
    a threshold that depends on document length."""
    data = read_file(os.path.join(self.parsed_dir, 'CACM-' + doc + '.txt'))
    words = data.split()
    # Approximate length in sentences, assuming ~15 words per sentence.
    doc_len = len(words) / 15
    word_freq = Counter(words)
    sig_words = []
    # Frequency threshold: lower for short documents, higher for long ones.
    if doc_len < 25:
        threshold = 7 - 0.1 * (25 - doc_len)
    elif doc_len <= 40:
        threshold = 7
    else:
        threshold = 7 + 0.1 * (doc_len - 40)
    for word in word_freq.keys():
        if word_freq[word] >= threshold:
            sig_words.append(word)
    return sig_words
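
# Illustrative sketch of the length-dependent threshold used above (a
# Luhn-style significant-word criterion): the cutoff relaxes for short
# documents and tightens for long ones. "_example_threshold" is hypothetical.
def _example_threshold(num_tokens):
    doc_len = num_tokens / 15
    if doc_len < 25:
        return 7 - 0.1 * (25 - doc_len)
    if doc_len <= 40:
        return 7
    return 7 + 0.1 * (doc_len - 40)

# _example_threshold(150) -> 5.5  (about 10 sentences)
# _example_threshold(450) -> 7.0  (about 30 sentences)
# _example_threshold(900) -> 9.0  (about 60 sentences)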
def get_snippet(self):
    """Build query-biased snippets for each scored document.

    Each sentence receives a significance factor, and a document's
    snippet sentences are kept sorted by that factor, highest first.
    """
    parsed_sentences = {}
    org_sentences = {}
    for item in self.doc_scores:
        doc = item[0]
        content = read_file(
            os.path.join(self.raw_docs, 'CACM-' + doc + '.html'))
        self.dataparser.initialize()
        self.dataparser.feed(content)
        data = []
        dataparser_op = self.dataparser.get_data()
        # Locate the 'pre' marker; everything from the final match on is dropped.
        end = len(dataparser_op)
        for i in range(end):
            if dataparser_op[i] == 'pre':
                end = i
        if end > 3:
            org = " ".join(dataparser_op[3:end]).split('\n\n')
        else:
            org = dataparser_op[3].split('\n\n')
        # The title is the second block; the body drops the first two and
        # the last two blocks.
        self.titles[doc] = org[1]
        org_data = org[2:-2]
        # Parse the document body into cleaned text.
        for line in org_data:
            data.append(parse_stuff(line, period=True))
        parsed_sentences[doc] = [line.split('.') for line in data]
        org_sentences[doc] = [line.split('.') for line in org_data]
        # Significant words: query terms plus the document's frequent terms.
        sig_words = self.significant_words.union(
            set(self.get_freq_terms(doc)))
        for portion_index in range(len(parsed_sentences[doc])):
            portion = parsed_sentences[doc][portion_index]
            for sent_index in range(len(portion)):
                sent = portion[sent_index]
                if not sent:
                    continue
                words = sent.strip().split()
                first_sig = None
                last_sig = 0
                sig_count = 0
                non_sig_count = 0
                max_non_sig = 20
                # Find the span bracketed by significant words, allowing at
                # most max_non_sig non-significant words in between.
                for i in range(len(words)):
                    if words[i] in sig_words:
                        if first_sig is None:
                            first_sig = i
                            sig_count += 1
                        elif non_sig_count <= max_non_sig:
                            last_sig = i
                            sig_count += 1
                    elif first_sig is not None and non_sig_count <= max_non_sig:
                        non_sig_count += 1
                cts = Counter(word in sig_words for word in words)
                if first_sig is not None and last_sig > first_sig:
                    # Significance factor: (significant words in span)^2 / span length.
                    sig_factor = sig_count ** 2 / (last_sig - first_sig)
                elif cts[True]:
                    sig_factor = cts[True] / max_non_sig
                else:
                    sig_factor = 0
                self.snippets.setdefault(doc, []).append(
                    (org_sentences[doc][portion_index][sent_index].strip(),
                     sig_factor))
    # Rank each document's sentences by significance factor, best first.
    for doc in self.snippets.keys():
        self.snippets[doc] = sorted(
            self.snippets[doc], key=lambda k: k[1], reverse=True)
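
# Illustrative sketch of the sentence significance factor computed above:
# (number of significant words in the bracketed span) squared, divided by
# the span length. The gap limit (max_non_sig) is ignored here for brevity;
# "_example_sig_factor" is hypothetical.
def _example_sig_factor(words, sig_words):
    positions = [i for i, w in enumerate(words) if w in sig_words]
    if len(positions) < 2:
        return 0
    span = positions[-1] - positions[0]
    return len(positions) ** 2 / span

# _example_sig_factor('search engines rank web pages by relevance'.split(),
#                     {'search', 'rank', 'relevance'})
# -> 3 ** 2 / 6 == 1.5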