def test_clear_sentences9(self):
    '''
    clear_sentences must return a non-empty result for a sentence that
    names both objects with trademark signs, different casing and a space
    in place of the hyphen.
    '''
    first_object = Argument('coca-cola light')
    second_object = Argument('pepsi light')
    candidates = [
        Sentence('Coca Cola™ light tastes better than Pepsi™ light',
                 20, '', ''),
    ]

    remaining = clear_sentences(candidates, first_object, second_object)

    self.assertTrue(remaining)
def test_extract_main_links3(self):
    '''
    extract_main_links must report 'ingredients' for object A and
    'sweeteners' for object B when the sentences spell coca-cola with a
    trademark sign and mixed casing.
    '''
    text_a = 'Coca-Cola™ tastes better than pepsi, because of its ingredients'
    text_b = 'Pepsi is worse than Coca-cola™, because of the better sweeteners'
    sentences_a = [Sentence(text_a, 20, '', '')]
    sentences_b = [Sentence(text_b, 20, '', '')]
    obj_a = Argument('coca-cola')
    obj_b = Argument('pepsi light')

    links = extract_main_links(sentences_a, sentences_b, obj_a, obj_b)

    expected = {'A': ['ingredients'], 'B': ['sweeteners']}
    self.assertEqual(links, expected)
def extract_sentences(es_json, aggregate_duplicates=True):
    '''
    Extracts the sentences from an Elastic Search commoncrawl2 json result.
    (This is the default and can be changed in constants.py)

    es_json:    response object
                must offer a json() method returning the decoded Elastic
                Search result; the hits are read from ['hits']['hits']

    aggregate_duplicates: Boolean
                if True, the document id / sentence id of a hit whose
                normalised text was already seen is merged into the id_pair
                of the first sentence with that text; if False, such hits
                are dropped

    Returns a list of Sentence objects, one per distinct normalised text,
    in order of first occurrence. An empty list is returned if the result
    raises a KeyError while the hits are looked up.
    '''
    try:
        hits = es_json.json()['hits']['hits']
    except KeyError:
        return []

    sentences = []
    # Normalised text -> the Sentence first created for it. Replaces the
    # linear rescan of `sentences` for every duplicate hit.
    # NOTE(review): assumes Sentence keeps `text` unchanged, so that the
    # normalised form of sentence.text equals the key it is stored under.
    first_by_key = {}

    for hit in hits:
        source = hit['_source']
        text = source['text']
        document_id = source.get('document_id', '')
        sentence_id = source.get('sentence_id', '')
        # Normalise once per hit instead of once per comparison.
        key = prepare_sentence_comparison(text)

        if key not in first_by_key:
            sentence = Sentence(text, hit['_score'], document_id,
                                sentence_id)
            first_by_key[key] = sentence
            sentences.append(sentence)
            continue

        if not aggregate_duplicates:
            continue

        original = first_by_key[key]
        if document_id not in original.id_pair:
            original.add_id_pair(document_id, sentence_id)
        elif original.id_pair[document_id] > sentence_id:
            # Same document again: keep the smaller sentence id.
            original.id_pair[document_id] = sentence_id

    return sentences
def test_clear_sentences1(self):
    '''
    clear_sentences must return an empty (falsy) result when the only
    candidate sentence is the question 'Dog is worse than cat?'.
    '''
    question = Sentence('Dog is worse than cat?', 20, '', '')

    remaining = clear_sentences([question], self.objA, self.objB)

    self.assertFalse(remaining)