コード例 #1
0
 def test_clear_sentences9(self):
     """clear_sentences returns a truthy result despite trademark signs.

     The objects are passed as 'coca-cola light' and 'pepsi light', while
     the sentence spells them 'Coca Cola™ light' and 'Pepsi™ light'.
     """
     first_object = Argument('coca-cola light')
     second_object = Argument('pepsi light')
     candidate = Sentence(
         'Coca Cola™ light tastes better than Pepsi™ light', 20, '', '')
     remaining = clear_sentences([candidate], first_object, second_object)
     self.assertTrue(remaining)
コード例 #2
0
 def test_extract_main_links3(self):
     """extract_main_links reports one link per side for ™-marked sentences.

     Side A's only sentence ends in '... because of its ingredients' and
     side B's in '... because of the better sweeteners'; the expected
     mapping is 'ingredients' under 'A' and 'sweeteners' under 'B'.
     """
     text_a = (
         'Coca-Cola™ tastes better than pepsi, because of its ingredients')
     text_b = (
         'Pepsi is worse than Coca-cola™, because of the better sweeteners')
     sentences_a = [Sentence(text_a, 20, '', '')]
     sentences_b = [Sentence(text_b, 20, '', '')]
     obj_a = Argument('coca-cola')
     obj_b = Argument('pepsi light')

     links = extract_main_links(sentences_a, sentences_b, obj_a, obj_b)
     expected_links = {'A': ['ingredients'], 'B': ['sweeteners']}
     self.assertEqual(links, expected_links)
コード例 #3
0
ファイル: es_requester.py プロジェクト: sayankotor/cqas
def extract_sentences(es_json, aggregate_duplicates=True):
    '''
    Extracts the sentences from an Elastic Search commoncrawl2 result. (commoncrawl2 is the
    default and, per the original note, can be changed in constants.py)

    es_json:    response-like object
                anything exposing a .json() method whose decoded payload holds the
                hits under ['hits']['hits']; each hit needs '_source' (with 'text' and
                optionally 'document_id' / 'sentence_id') and '_score'

    aggregate_duplicates:   bool
                if True, a hit whose normalised text was already seen contributes its
                (document_id, sentence_id) to the sentence kept for that text: an unknown
                document id is added, for a known one the smaller sentence id is kept.
                If False such hits are dropped.

    Returns a list of Sentence objects in first-seen order, one per distinct normalised
    text (as computed by prepare_sentence_comparison), or [] if the payload has no
    ['hits']['hits'] entry.
    '''
    try:
        hits = es_json.json()['hits']['hits']
    except KeyError:
        return []

    sentences = []
    # Normalised text -> the Sentence kept for it. Replaces rescanning (and
    # re-normalising) every kept sentence for each duplicate hit.
    # NOTE(review): assumes Sentence keeps the text it is constructed with
    # unchanged in .text - confirm against the Sentence class.
    kept_by_key = {}
    for hit in hits:
        source = hit['_source']
        text = source['text']
        document_id = source.get('document_id', '')
        sentence_id = source.get('sentence_id', '')
        # Normalise once per hit instead of on every comparison.
        key = prepare_sentence_comparison(text)

        if key not in kept_by_key:
            kept = Sentence(text, hit['_score'], document_id, sentence_id)
            kept_by_key[key] = kept
            sentences.append(kept)
            continue

        if not aggregate_duplicates:
            continue

        kept = kept_by_key[key]
        if document_id not in kept.id_pair:
            kept.add_id_pair(document_id, sentence_id)
        elif kept.id_pair[document_id] > sentence_id:
            # Same document seen again: keep the smaller sentence id.
            kept.id_pair[document_id] = sentence_id

    return sentences
コード例 #4
0
 def test_clear_sentences1(self):
     """clear_sentences returns a falsy result for 'Dog is worse than cat?'.

     NOTE(review): presumably the sentence is dropped because it is a
     question - confirm against clear_sentences.
     """
     question = Sentence('Dog is worse than cat?', 20, '', '')
     remaining = clear_sentences([question], self.objA, self.objB)
     self.assertFalse(remaining)