class SemEval2014Task4_Laptops(__SemEval2014Task4):
    """ SemEval 2014 Task 4 Laptop Dataset for Aspect based Sentiment Analysis.
        Download: http://alt.qcri.org/semeval2014/task4/index.php?id=data-and-tools
    """

    # paths
    TRAIN_FILE = FilePath(
        "SemEval2014-Task4/Laptops_Train.xml",
        "https://raw.githubusercontent.com/pedrobalage/SemevalAspectBasedSentimentAnalysis/master/semeval_data/Laptop_Train_v2.xml"
    )
    TEST_FILE = FilePath(
        "SemEval2014-Task4/laptops-trial.xml",
        "https://alt.qcri.org/semeval2014/task4/data/uploads/laptops-trial.xml"
    )

    n_train_items = lambda self: 3045
    n_eval_items = lambda self: 100

    def get_aspect_label_pairs(self, sentence):
        # get aspect terms
        aspect_terms = sentence.find('aspectTerms')
        # load aspect label pairs
        aspect_label_pairs = []
        if aspect_terms is not None:
            aspect_label_pairs += [(aspect.attrib['term'],
                                    aspect.attrib['polarity'])
                                   for aspect in aspect_terms]
        # return
        return aspect_label_pairs
class SemEval2014Task4_Category(__SemEval2014Task4):
    """ SemEval 2014 Task 4 Aspect-Category Dataset for Aspect based Sentiment Analysis.
        Only provides examples for aspect categories, which need not be mentioned explicitly in the text.
        Download: http://alt.qcri.org/semeval2014/task4/index.php?id=data-and-tools
    """

    # paths
    TRAIN_FILE = FilePath(
        "SemEval2014-Task4/Restaurants_Train.xml",
        "https://raw.githubusercontent.com/pedrobalage/SemevalAspectBasedSentimentAnalysis/master/semeval_data/Restaurants_Train_v2.xml"
    )
    TEST_FILE = FilePath(
        "SemEval2014-Task4/restaurants-trial.xml",
        "https://alt.qcri.org/semeval2014/task4/data/uploads/restaurants-trial.xml"
    )

    n_train_items = lambda self: 3041
    n_eval_items = lambda self: 100

    def get_aspect_label_pairs(self, sentence):
        # only get categories
        aspect_categories = sentence.find('aspectCategories')
        # build aspect label pairs
        aspect_label_pairs = []
        if aspect_categories is not None:
            aspect_label_pairs += [
                (aspect.attrib['category'], aspect.attrib['polarity'])
                for aspect in aspect_categories
            ]
        # return
        return aspect_label_pairs
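
# A minimal usage sketch, not part of the original classes: a hand-written
# SemEval-2014 Task 4 sentence element (text, offsets and polarities are
# invented for illustration; aspectCategories appear only in the restaurant
# files) and the aspect/label pairs the two classes above would extract.
import xml.etree.ElementTree as ET

_sample_sentence = ET.fromstring("""
<sentence id="1">
    <text>The fajitas were great, but the service was slow.</text>
    <aspectTerms>
        <aspectTerm term="fajitas" polarity="positive" from="4" to="11"/>
        <aspectTerm term="service" polarity="negative" from="32" to="39"/>
    </aspectTerms>
    <aspectCategories>
        <aspectCategory category="food" polarity="positive"/>
        <aspectCategory category="service" polarity="negative"/>
    </aspectCategories>
</sentence>
""")

# term-level pairs, as SemEval2014Task4_Laptops.get_aspect_label_pairs builds them
print([(a.attrib['term'], a.attrib['polarity'])
       for a in _sample_sentence.find('aspectTerms')])
# category-level pairs, as SemEval2014Task4_Category.get_aspect_label_pairs builds them
print([(a.attrib['category'], a.attrib['polarity'])
       for a in _sample_sentence.find('aspectCategories')])
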
class SemEval2015Task12(AOEx_Dataset):
    """ SemEval 2015 Task 12 dataset for Aspect-Opinion Extraction
        Download: https://github.com/happywwy/Coupled-Multi-layer-Attentions/tree/master/util/data_semEval
    """

    n_train_items = lambda self: 1316
    n_eval_items = lambda self: 686
    
    def yield_items(self, aspect_fname:str, sent_opinion_fname:str):
        # build full paths to files
        aspect_fpath = self.data_base_dir / aspect_fname
        sent_opinion_fpath = self.data_base_dir / sent_opinion_fname

        # load file contents
        with open(aspect_fpath, 'r', encoding='utf-8') as f:
            all_aspects = f.read().replace('NULL', '').split('\n')
        with open(sent_opinion_fpath, 'r', encoding='utf-8') as f:
            all_sents_opinions = f.read().split('\n')
        assert len(all_aspects) == len(all_sents_opinions)

        # preprocess data
        for sent_opinions, aspects in zip(all_sents_opinions, all_aspects):
            # separate sentence from opinions
            sent, opinions = sent_opinions.split('##') if '##' in sent_opinions else (sent_opinions, '')
            # get aspects and opinions
            opinions = [o.strip()[:-3] for o in opinions.split(',')] if len(opinions) > 0 else []
            aspects = [a.strip() for a in aspects.split(',')] if len(aspects) > 0 else []
            # build aspect and opinion spans
            opinion_pos = [sent.find(o) for o in opinions]
            opinion_spans = [(i, i + len(o)) for i, o in zip(opinion_pos, opinions)]
            aspect_pos = [sent.find(a) for a in aspects]
            aspect_spans = [(i, i + len(a)) for i, a in zip(aspect_pos, aspects)]
            # yield dataset item
            yield AOEx_DatasetItem(
                sentence=sent, 
                aspect_spans=aspect_spans, 
                opinion_spans=opinion_spans
            )

    # yield train and test items
    yield_train_items = lambda self: self.yield_items(
        aspect_fname=FilePath(
            "SemEval2015-Task12/aspectTerm_res15", 
            "https://raw.githubusercontent.com/happywwy/Coupled-Multi-layer-Attentions/master/util/data_semEval/aspectTerm_res15"
        ),
        sent_opinion_fname=FilePath(
            "SemEval2015-Task12/sentence_res15_op", 
            "https://raw.githubusercontent.com/happywwy/Coupled-Multi-layer-Attentions/master/util/data_semEval/sentence_res15_op"
        )
    )
    yield_eval_items = lambda self: self.yield_items(
        aspect_fname=FilePath(
            "SemEval2015-Task12/aspectTerm_restest15", 
            "https://raw.githubusercontent.com/happywwy/Coupled-Multi-layer-Attentions/master/util/data_semEval/aspectTerm_restest15"
        ),
        sent_opinion_fname=FilePath(
            "SemEval2015-Task12/sentence_restest15_op", 
            "https://raw.githubusercontent.com/happywwy/Coupled-Multi-layer-Attentions/master/util/data_semEval/sentence_restest15_op"
        )
    )
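
# A minimal sketch (hand-written stand-ins, not lines copied from the dataset)
# of the plain-text row format that SemEval2015Task12.yield_items expects: the
# aspect file holds comma-separated aspect terms (or NULL), and the sentence
# file holds "sentence##opinion +1, opinion -1" with a two-character polarity
# code after each opinion term.
_aspect_line = "food, service"
_sent_opinion_line = "The food was great but the service was awful.##great +1, awful -1"

_sent, _opinions = _sent_opinion_line.split('##')
_opinions = [o.strip()[:-3] for o in _opinions.split(',')]   # ['great', 'awful']
_aspects = [a.strip() for a in _aspect_line.split(',')]      # ['food', 'service']

# character spans are recovered by searching the sentence, exactly as in yield_items
print([(_sent.find(a), _sent.find(a) + len(a)) for a in _aspects])
print([(_sent.find(o), _sent.find(o) + len(o)) for o in _opinions])
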
class SemEval2015Task12_AspectPolarity(__SemEval2015Task12):
    """ SemEval 2015 Task 12 dataset for Aspect-based Sentiment Analysis
        Download: http://alt.qcri.org/semeval2015/task12/index.php?id=data-and-tools
    """

    LABELS = ['positive', 'neutral', 'negative']
    TRAIN_FILE = FilePath(
        "SemEval2015-Task12/ABSA-15_Restaurants_Train_Final.xml",
        "https://raw.githubusercontent.com/peace195/aspect-based-sentiment-analysis/master/data/ABSA_SemEval2015/Restaurants_Train_Final.xml"
    )
    EVAL_FILE = FilePath(
        "SemEval2015-Task12/ABSA15_Restaurants_Test.xml",
        "https://raw.githubusercontent.com/peace195/aspect-based-sentiment-analysis/master/data/ABSA_SemEval2015/Restaurants_Test.xml"
    )

    n_train_items = lambda self: 833
    n_eval_items = lambda self: 402

    def yield_items(self, fpath: str) -> iter:
        # parse xml file
        tree = ET.parse(fpath)
        root = tree.getroot()
        # parse all reviews
        for review in root:
            for sent in review[0].findall('sentence'):
                # get sentence
                text = sent.find('text').text
                # find opinions
                opinions = sent.find('Opinions')
                if opinions is None:
                    continue
                # get aspects and sentiments
                aspects = [(int(o.attrib['from']), int(o.attrib['to']))
                           for o in opinions]
                sentiments = [o.attrib['polarity'] for o in opinions]
                # drop invalid entries - opinions without an aspect target (empty span)
                sentiments = [
                    s for s, (b, e) in zip(sentiments, aspects) if b < e
                ]
                aspects = [(b, e) for (b, e) in aspects if b < e]
                # no aspects found
                if len(aspects) == 0:
                    continue

                # build dataset item
                yield NEC_DatasetItem(
                    sentence=text,
                    entity_spans=aspects,
                    labels=[
                        SemEval2015Task12_AspectPolarity.LABELS.index(s)
                        for s in sentiments
                    ])
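
# A minimal, hand-written sketch of the ABSA-15 review markup consumed by
# SemEval2015Task12_AspectPolarity.yield_items; the review text is invented,
# while the element and attribute names ('Opinions', 'polarity', 'from', 'to')
# follow the official schema.
import xml.etree.ElementTree as ET

_sample_review = ET.fromstring("""
<Reviews>
  <Review rid="1">
    <sentences>
      <sentence id="1:0">
        <text>The pizza was superb.</text>
        <Opinions>
          <Opinion target="pizza" category="FOOD#QUALITY"
                   polarity="positive" from="4" to="9"/>
        </Opinions>
      </sentence>
    </sentences>
  </Review>
</Reviews>
""")

for _review in _sample_review:
    for _sent in _review[0].findall('sentence'):     # _review[0] is <sentences>
        _opinions = _sent.find('Opinions')
        _spans = [(int(o.attrib['from']), int(o.attrib['to'])) for o in _opinions]
        _labels = [o.attrib['polarity'] for o in _opinions]
        print(_sent.find('text').text, _spans, _labels)
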
class SemEval2015Task12_OpinionPolarity(__SemEval2015Task12):
    """ SemEval 2015 Task 12 dataset for Opinion-based Sentiment Analysis
        Download: https://github.com/happywwy/Coupled-Multi-layer-Attentions/tree/master/util/data_semEval
    """

    LABELS = ['positive', 'negative']
    TRAIN_FILE = FilePath(
        "SemEval2015-Task12/sentence_res15_op",
        "https://raw.githubusercontent.com/happywwy/Coupled-Multi-layer-Attentions/master/util/data_semEval/sentence_res15_op"
    )
    EVAL_FILE = FilePath(
        "SemEval2015-Task12/sentence_restest15_op",
        "https://raw.githubusercontent.com/happywwy/Coupled-Multi-layer-Attentions/master/util/data_semEval/sentence_restest15_op"
    )

    n_train_items = lambda self: 760
    n_eval_items = lambda self: 333

    def yield_items(self, fpath: str) -> iter:
        # load file content
        with open(fpath, 'r', encoding='utf-8') as f:
            all_sents_opinions = f.read().split('\n')
        # preprocess data
        for sent_opinions in all_sents_opinions:
            # no opinions
            if '##' not in sent_opinions:
                continue
            # separate sentence from opinions
            sent, opinions = sent_opinions.split('##')
            # get aspects and opinions
            opinions = [o.strip() for o in opinions.split(',')
                        ] if len(opinions) > 0 else []
            opinions, sentiments = zip(*[(o[:-2].strip(), o[-2:])
                                         for o in opinions])
            # build opinion spans
            opinion_pos = [sent.find(o) for o in opinions]
            opinion_spans = [(i, i + len(o))
                             for i, o in zip(opinion_pos, opinions)]
            # get sentiment labels
            sentiments = [(-int(i) + 1) // 2 for i in sentiments]
            # build dataset item
            yield NEC_DatasetItem(sentence=sent,
                                  entity_spans=opinion_spans,
                                  labels=sentiments)
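
# Worked example of the polarity-code mapping used in
# SemEval2015Task12_OpinionPolarity.yield_items: the trailing two-character
# code is turned into an index into LABELS = ['positive', 'negative'].
for _code in ('+1', '-1'):
    _index = (-int(_code) + 1) // 2      # '+1' -> 0, '-1' -> 1
    print(_code, SemEval2015Task12_OpinionPolarity.LABELS[_index])
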
class SemEval2010Task8(RelExDataset):
    """ SemEval2010 Task8 Dataset
        Download: https://github.com/sahitya0000/Relation-Classification/blob/master/corpus/SemEval2010_task8_all_data.zip
    """

    # training and testing files
    TRAIN_FILE = FilePath(
        "SemEval2010-Task8/SemEval2010_task8_training/TRAIN_FILE.TXT",
        "https://raw.githubusercontent.com/sahitya0000/Relation-Classification/master/corpus/SemEval2010_task8_training/TRAIN_FILE.TXT"
    )
    EVAL_FILE = FilePath(
        "SemEval2010-Task8/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT",
        "https://raw.githubusercontent.com/sahitya0000/Relation-Classification/master/corpus/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT"
    )
    # set of valid labels
    LABELS = [
        "Other", "Component-Whole(e2,e1)", "Component-Whole(e1,e2)",
        "Instrument-Agency(e2,e1)", "Instrument-Agency(e1,e2)",
        "Member-Collection(e2,e1)", "Member-Collection(e1,e2)",
        "Cause-Effect(e2,e1)", "Cause-Effect(e1,e2)",
        "Entity-Destination(e2,e1)", "Entity-Destination(e1,e2)",
        "Content-Container(e2,e1)", "Content-Container(e1,e2)",
        "Message-Topic(e2,e1)", "Message-Topic(e1,e2)",
        "Product-Producer(e2,e1)", "Product-Producer(e1,e2)",
        "Entity-Origin(e2,e1)", "Entity-Origin(e1,e2)"
    ]

    n_train_items = lambda self: 8000
    n_eval_items = lambda self: 2717

    # yield training and evaluation items
    yield_train_items = lambda self: self.yield_item(
        self.data_base_dir / SemEval2010Task8.TRAIN_FILE)
    yield_eval_items = lambda self: self.yield_item(self.data_base_dir /
                                                    SemEval2010Task8.EVAL_FILE)

    def yield_item(self, fpath: str) -> iter:
        # load data
        with open(fpath, 'r', encoding='utf-8') as f:
            lines = f.read().strip().split('\n')

        # read examples
        for sent_line, relation_line in zip(lines[::4], lines[1::4]):
            # get text
            sent = sent_line.split('\t')[1].strip()
            # clean up sentence
            assert sent[0] == sent[-1] == '"'
            sent = sent[1:-1]
            # find entities in sentence
            entity_A = re.search(r'<e1>(.*)</e1>', sent)
            entity_B = re.search(r'<e2>(.*)</e2>', sent)
            # get spans from matches: each marker pair '<eX>' + '</eX>'
            # adds 4 + 5 = 9 characters that are removed from the text below
            entity_span_A = (entity_A.start(), entity_A.end() - 4 - 5)
            entity_span_B = (entity_B.start(), entity_B.end() - 4 - 5)
            # the entity that occurs second is additionally shifted by the
            # first entity's marker pair
            if entity_span_A[0] < entity_span_B[0]:
                entity_span_B = (entity_span_B[0] - 4 - 5,
                                 entity_span_B[1] - 4 - 5)
            else:
                entity_span_A = (entity_span_A[0] - 4 - 5,
                                 entity_span_A[1] - 4 - 5)
            # remove markers from text
            sent = re.sub(r'<(/?)e1>', '', sent)
            sent = re.sub(r'<(/?)e2>', '', sent)
            # get label
            label = relation_line.strip()

            # yield features
            yield RelExDatasetItem(
                sentence=sent,
                source_entity_span=entity_span_A,
                target_entity_span=entity_span_B,
                relation_type=SemEval2010Task8.LABELS.index(label))
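
# A hand-written example of one record in the SemEval-2010 Task 8 text format
# (sentence line plus relation line) and of the span arithmetic performed in
# SemEval2010Task8.yield_item: each marker pair '<eX>' + '</eX>' occupies
# 4 + 5 = 9 characters, so the entity that appears second in the sentence is
# shifted by the first pair's width as well.
import re

_sent_line = '1\t"The <e1>company</e1> fabricates plastic <e2>chairs</e2>."'
_relation_line = 'Product-Producer(e2,e1)'

_sent = _sent_line.split('\t')[1].strip()[1:-1]
_m1 = re.search(r'<e1>(.*)</e1>', _sent)
_m2 = re.search(r'<e2>(.*)</e2>', _sent)
_span1 = (_m1.start(), _m1.end() - 9)               # drop this pair's own markers
_span2 = (_m2.start() - 9, _m2.end() - 9 - 9)       # e2 follows e1 here, shift by 9 more
_clean = re.sub(r'<(/?)e[12]>', '', _sent)
print(_clean[_span1[0]:_span1[1]], _clean[_span2[0]:_span2[1]], _relation_line)
# -> company chairs Product-Producer(e2,e1)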