Example #1
# Imports assumed for this snippet (module paths per nlp-architect; may vary by version):
import argparse

from nlp_architect import LIBRARY_DATASETS
from nlp_architect.models.absa.train.train import TrainSentiment
from nlp_architect.utils.io import (validate_existing_directory,
                                    validate_existing_filepath,
                                    validate_existing_path)


def main() -> None:
    tripadvisor_train = LIBRARY_DATASETS / 'absa' / \
        'tripadvisor_co_uk-travel_restaurant_reviews_sample_2000_train.csv'

    parser = argparse.ArgumentParser(description='ABSA Train')
    parser.add_argument('--rerank-model',
                        type=validate_existing_filepath,
                        default=None,
                        help='Path to rerank model .h5 file')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--data',
                       type=validate_existing_path,
                       default=tripadvisor_train,
                       help='Path to raw data (directory or txt/csv file)')
    group.add_argument('--parsed-data',
                       type=validate_existing_directory,
                       default=None,
                       help='Path to parsed data directory')
    args = parser.parse_args()

    train = TrainSentiment(parse=not args.parsed_data,
                           rerank_model=args.rerank_model)
    opinion_lex, aspect_lex = train.run(data=args.data,
                                        parsed_data=args.parsed_data)

    print('Aspect Lexicon: {}\n'.format(aspect_lex) + '=' * 40 + '\n')
    print('Opinion Lexicon: {}'.format(opinion_lex))
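The same training can also be driven programmatically, bypassing argparse; a minimal sketch, where my_reviews.csv is a hypothetical raw-data file:

train = TrainSentiment(parse=True, rerank_model=None)
opinion_lex, aspect_lex = train.run(data='my_reviews.csv')  # hypothetical input file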
Example #2
# Imports assumed for this snippet (LEXICONS_OUT lives in nlp-architect's absa module; paths may vary by version):
import argparse
from os import path
from pathlib import Path

from nlp_architect.models.absa import LEXICONS_OUT
from nlp_architect.models.absa.train.train import TrainSentiment
from nlp_architect.utils.io import (validate_existing_directory,
                                    validate_existing_filepath,
                                    validate_existing_path)


def main() -> None:
    lib_root = Path(path.realpath(__file__)).parent.parent.parent.parent
    tripadvisor_train = (
        lib_root / "datasets" / "absa" /
        "tripadvisor_co_uk-travel_restaurant_reviews_sample_2000_train.csv")

    parser = argparse.ArgumentParser(description="ABSA Train")
    parser.add_argument(
        "--rerank-model",
        type=validate_existing_filepath,
        default=None,
        help="Path to rerank model .h5 file",
    )

    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "--data",
        type=validate_existing_path,
        default=tripadvisor_train,
        help="Path to raw data (directory or txt/csv file)",
    )
    group.add_argument(
        "--parsed-data",
        type=validate_existing_directory,
        default=None,
        help="Path to parsed data directory",
    )
    args = parser.parse_args()

    train = TrainSentiment(parse=not args.parsed_data,
                           rerank_model=args.rerank_model)
    opinion_lex, aspect_lex = train.run(data=args.data,
                                        parsed_data=args.parsed_data)

    print(f"Done.\nAspect and Opinion lexicons written to {LEXICONS_OUT}.")
Example #3
    # Bokeh upload callback extracted from a larger UI module: train_src, asp_src
    # and op_src are ColumnDataSources and text_status is a status widget; io, os,
    # base64, read_csv, SENTIMENT_OUT, TrainSentiment, AcquireTerms and RerankTerms
    # are imported or defined in that enclosing module.
    def train_file_callback(attr, old, new):
        global train_data
        SENTIMENT_OUT.mkdir(parents=True, exist_ok=True)
        train = TrainSentiment(parse=True, rerank_model=None)
        if len(train_src.data['file_contents']) == 1:
            train_data = read_csv(train_src, index_cols=0)
            file_name = train_src.data['file_name'][0]
            raw_data_path = SENTIMENT_OUT / file_name
            train_data.to_csv(raw_data_path, header=False)
            print('Running SentimentTraining on data...')
            train.run(data=raw_data_path)
        else:
            f_contents = train_src.data['file_contents']
            f_names = train_src.data['file_name']
            raw_data_path = SENTIMENT_OUT / f_names[0].split('/')[0]
            os.makedirs(raw_data_path, exist_ok=True)
            for f_content, f_name in zip(f_contents, f_names):
                read_parsed_files(f_content, f_name)
            print('Running SentimentTraining on data...')
            train.run(parsed_data=raw_data_path)

        text_status.value = "Lexicon extraction completed"

        # Base64-encode the aspect lexicon CSV so the client-side source can serve it
        with open(AcquireTerms.acquired_aspect_terms_path) as fp:
            aspect_data_csv = fp.read()
        file_data = base64.b64encode(aspect_data_csv.encode()).decode("utf-8")
        asp_src.data = {
            'file_contents': [file_data],
            'file_name': ['nameFile.csv']
        }

        out_path = RerankTerms.out_dir / 'generated_opinion_lex_reranked.csv'
        with open(out_path) as fp:
            opinion_data_csv = fp.read()
        file_data = base64.b64encode(opinion_data_csv.encode()).decode("utf-8")
        op_src.data = {
            'file_contents': [file_data],
            'file_name': ['nameFile.csv']
        }
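The (attr, old, new) signature above matches Bokeh's property-change callbacks; a minimal wiring sketch, assuming a Bokeh server app where train_src backs a file-upload widget:

from bokeh.models import ColumnDataSource

train_src = ColumnDataSource(data={'file_contents': [], 'file_name': []})
# Re-run lexicon training whenever a new file is pushed into the source
train_src.on_change('data', train_file_callback)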
Example #4

# args (--data_folder, --large, --asp_thresh, --op_thresh, --max_iter) are parsed
# by an argparse block elsewhere in the script.
if args.large == 'yes':
    print('Using large dataset: clothing_data/clothing_absa_train.csv')
    clothing_train = os.path.join(args.data_folder,
                                  'clothing_data/clothing_absa_train.csv')
else:
    print('Using small dataset: clothing_data/clothing_absa_train_small.csv')
    clothing_train = os.path.join(
        args.data_folder, 'clothing_data/clothing_absa_train_small.csv')

os.makedirs('outputs', exist_ok=True)

train = TrainSentiment(asp_thresh=args.asp_thresh,
                       op_thresh=args.op_thresh,
                       max_iter=args.max_iter)

opinion_lex, aspect_lex = train.run(data=clothing_train, out_dir='./outputs')

# Copy the generated lexicons to the outputs folder
asp_lex = shutil.copy(LEXICONS_OUT / 'generated_aspect_lex.csv', './outputs')
op_lex = shutil.copy(LEXICONS_OUT / 'generated_opinion_lex_reranked.csv',
                     './outputs')
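Both lexicons are plain CSV files, so a quick inspection sketch (assuming pandas is installed) is enough to eyeball the extracted terms:

import pandas as pd

aspect_df = pd.read_csv('./outputs/generated_aspect_lex.csv')
opinion_df = pd.read_csv('./outputs/generated_opinion_lex_reranked.csv')
print(aspect_df.head())
print(opinion_df.head())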


# Evaluation
# Although ABSA is an unsupervised method, it can be evaluated against a small
# sample of labeled data
def doc2IO(doc):
    """
    Converts ABSA doc to IO span format for evaluation