Example #1
# from tensorflow.compat.v1 import ConfigProto
# from tensorflow.compat.v1 import InteractiveSession
#
# # memory explodes without this
# config = ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 1.0
# config.gpu_options.allow_growth = True
# session = InteractiveSession(config=config)

from transformers import pipeline

sentiment_pipeline = pipeline('sentiment-analysis')


def evaluate_sentiment(sentence):
    # don't shadow the imported `pipeline` name; return None on failure
    result = None
    try:
        result = sentiment_pipeline(sentence)
    except Exception:
        print("evaluating {}".format(sentence), end='')
        print("Oh man")

    return result


if __name__ == '__main__':
    from concurrent.futures import ThreadPoolExecutor
    from tqdm import tqdm
    from random import shuffle

    # three-letter animal names for benchmark
    sources = [
    def test_torch_zero_shot_classification(self):
        for model_name in TEXT_CLASSIF_FINETUNED_MODELS:
            nlp = pipeline(task="zero-shot-classification",
                           model=model_name,
                           tokenizer=model_name)
            self._test_zero_shot_pipeline(nlp)

    def test_torch_conversation(self):
        for model_name in DIALOGUE_FINETUNED_MODELS:
            nlp = pipeline(task="conversational",
                           model=model_name,
                           tokenizer=model_name)
            self._test_conversation_pipeline(nlp)

    def test_torch_ner(self):
        mandatory_keys = {"entity", "word", "score"}
        for model_name in NER_FINETUNED_MODELS:
            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name)
            self._test_ner_pipeline(nlp, mandatory_keys)

    def test_torch_feature_extraction(self):
        for model_name in FEATURE_EXTRACT_FINETUNED_MODELS:
            nlp = pipeline(task="feature-extraction",
                           model=model_name,
                           tokenizer=model_name)
            self._test_mono_column_pipeline(nlp, VALID_INPUTS, {})
Example #6
import pandas as pd
import random
import re
import requests
from nltk.corpus import stopwords
from stemming.porter2 import stem
from transformers import pipeline

FAST_GEO = pd.read_csv("data/geo_uri_label_utf8.csv", names=["URI", "Label"])
FAST_TOPICS = pd.read_csv("data/topic_uri_label_utf8.csv",
                          names=["URI", "Label"])

for fast_df in [FAST_GEO, FAST_TOPICS]:
    fast_df["stemmed"] = fast_df["Label"].apply(lambda x: stem(x.lower()))

NER = pipeline("ner")
special_char_re = re.compile(r'[^a-zA-Z]')
stop_words_list = stopwords.words('english')


def cleanup(term: str) -> str:
    cleaned = []
    for word in term.split():
        cleaned_word = special_char_re.sub(' ', word).lower()
        if cleaned_word in stop_words_list:
            continue
        cleaned.append(cleaned_word)
    return ' '.join(cleaned)
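
A quick sanity check of cleanup (the input string is made up). Stopwords are dropped and non-letter characters are replaced with spaces, so some tokens keep a trailing space:

print(cleanup("The Geology of Mars!"))  # -> "geology mars " (note the trailing space)
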


def create_datasets(etd_path: str):
Example #7
import random

from transformers import pipeline
from textblob import TextBlob
from dotenv import load_dotenv
import pymongo
import requests
import matplotlib.pyplot as plt
from tqdm import tqdm

load_dotenv()

# ## Text Generation


generator = pipeline("text-generation", framework="pt")



def generate_article_from_title(title: str) -> str:
    article = generator(title,
                        max_length=random.randint(500,
                                                  1500))[0]["generated_text"]
    if "." in article:
        return article[article.index(".") + 1:]
    return article
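
A hypothetical call to the generator above; the output varies between runs because the pipeline samples by default:

print(generate_article_from_title("Scientists Discover New Deep-Sea Species"))
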


# ## Summarisation
Example #8
from transformers import pipeline


def get_qa_pipeline():
    qa_pipeline = pipeline("question-answering")
    return qa_pipeline
Example #9
def answer_question(pipeline, question, context):
    result = pipeline(question=question, context=context)
    #return f"Answer: {result['answer']}, score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}"
    return result
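
A quick usage sketch combining the two helpers above (the question and context are made up):

qa = get_qa_pipeline()
result = answer_question(qa, "Who wrote the story?",
                         "The story was written by Raoof Naushad.")
print(result['answer'], round(result['score'], 4))
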
Example #10
# coding: utf-8


import argparse
import azure.cognitiveservices.speech as speechsdk
from transformers import pipeline
summarizer = pipeline("summarization")

# Creates an instance of a speech config with specified subscription key and service region.
# Replace with your own subscription key and region.
speech_key, service_region = "YOUR_SUBSCRIPTION_KEY", "centralindia"
speech_config = speechsdk.SpeechConfig(subscription=speech_key,
                                       region=service_region)

parser = argparse.ArgumentParser()
# set up the input-file argument.
parser.add_argument('--filename', type=str)
args = parser.parse_args()


def summarize_pipeline(audio, chunks_output_folder='audio_chunks'):
    get_audio_chunks(audio)
    transcript = transcribe_each_chunk(chunks_output_folder)
    summary = summarize(transcript)
    return summary
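
The helpers called above (get_audio_chunks, transcribe_each_chunk, summarize) are not shown in the example. A minimal sketch of what they might look like, assuming pydub for silence-based chunking and one recognize_once() call per chunk:

import os

from pydub import AudioSegment
from pydub.silence import split_on_silence


def get_audio_chunks(audio, chunks_output_folder='audio_chunks'):
    # split the recording on silence so each chunk fits one recognition call
    sound = AudioSegment.from_wav(audio)
    chunks = split_on_silence(sound,
                              min_silence_len=500,
                              silence_thresh=sound.dBFS - 14)
    os.makedirs(chunks_output_folder, exist_ok=True)
    for i, chunk in enumerate(chunks):
        chunk.export(os.path.join(chunks_output_folder, f'chunk{i}.wav'),
                     format='wav')


def transcribe_each_chunk(chunks_output_folder):
    # recognize each chunk once and stitch the partial transcripts together
    parts = []
    for name in sorted(os.listdir(chunks_output_folder)):
        audio_config = speechsdk.audio.AudioConfig(
            filename=os.path.join(chunks_output_folder, name))
        recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config,
                                                audio_config=audio_config)
        parts.append(recognizer.recognize_once().text)
    return ' '.join(parts)


def summarize(transcript):
    return summarizer(transcript, max_length=130,
                      min_length=30)[0]['summary_text']
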


# In[4]:
import csv

from bs4 import BeautifulSoup

csv_file = 'refract_data_engineering_articles.csv'

# read the articles from the csv file and put them in the list
articles = []
with open(csv_file, 'r') as csv_in:
    csv_reader = csv.reader(csv_in, delimiter=',')
    for row_num, row_content in enumerate(csv_reader):
        if row_num != 0:
            articles.append(row_content)

# run the articles through the model and get the summaries
print('Summarizing the assignment articles...')
summaries = []
summarizer = pipeline('summarization', model='t5-small', tokenizer='t5-small')
for article in articles:
    summaries.append(
        summarizer(article,
                   max_length=20,
                   min_length=0,
                   do_sample=False,
                   clean_up_tokenization_spaces=True))
    print('...')

# store the summaries in the csv file
with open(csv_file, 'w') as csv_out:
    csv_writer = csv.writer(csv_out, lineterminator='\n')
    new_content = [['source_content', 'ml_summary']]
    for i, summary in enumerate(summaries):
        new_content.append([articles[i][0], summary[0]['summary_text']])
    csv_writer.writerows(new_content)  # actually write the rows out
from elasticsearch import Elasticsearch
from flask import Flask, render_template, jsonify, request
from transformers import pipeline, AlbertTokenizer, AlbertForQuestionAnswering

import glob
import textract
import os
import torch
import json
import collections

indexName = 'triviaqa_wiki'

#Init Albert pipeline
qa_pipeline = pipeline('question-answering',
                       model="ktrapeznikov/albert-xlarge-v2-squad-v2",
                       tokenizer="albert-xlarge-v2",
                       device=0)
question = "What is the capital of the Netherlands?"
context = r"The four largest cities in the Netherlands are Amsterdam, Rotterdam, The Hague and Utrecht.[17] Amsterdam is the country's most populous city and nominal capital,[18] while The Hague holds the seat of the States General, Cabinet and Supreme Court.[19] The Port of Rotterdam is the busiest seaport in Europe, and the busiest in any country outside East Asia and Southeast Asia, behind only China and Singapore."

#initQA = qa_pipeline(question=question, context=context)

#print(initQA)

# Grab Elasticsearch instance
config = {'host': 'mc.ocbe.de', 'port': 9200}
es = Elasticsearch([config])

# test connection
es.ping()
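
The snippet stops after the connectivity check. A hedged sketch of how the retriever and reader are typically combined; the 'text' field name and the match query are assumptions about the triviaqa_wiki index:

def search_and_answer(question, size=3):
    # retrieve candidate passages, then extract an answer span from each
    body = {'query': {'match': {'text': question}}, 'size': size}
    hits = es.search(index=indexName, body=body)['hits']['hits']
    answers = []
    for hit in hits:
        res = qa_pipeline(question=question, context=hit['_source']['text'])
        answers.append((res['answer'], res['score']))
    # return the highest-scoring span, or None when nothing was retrieved
    return max(answers, key=lambda a: a[1]) if answers else None
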
from transformers import pipeline

qa_model = pipeline('question-answering')
from flask import Flask, render_template, request

app = Flask(__name__)

from transformers import pipeline
#from transformers import PreTrainedModel
#from transformers import  TFT5Model, TFBertModel
import requests
import pprint
import time

summarizer_bart = pipeline("summarization")
summarizer_t5 = pipeline("summarization", model="t5-base")


class models:
    def __init__(self, text, length):
        self.length = length
        self.text = text

    def bart(self):
        return summarizer_bart(self.text,
                               min_length=self.length,
                               max_length=self.length + 10)[0]['summary_text']

    def t5(self):
        return summarizer_t5(self.text,
                             min_length=self.length,
                             max_length=self.length + 10)[0]['summary_text']
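
A brief usage sketch of the class above (the text is a placeholder):

doc = "Long input text to be summarized goes here ..."
m = models(doc, 30)
print(m.bart())
print(m.t5())
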
import transformers
from transformers import pipeline
with open('micro_soft.txt', 'r') as f:
    text = f.read()


def summary_utils(summary):
    # pull the text out of the pipeline's [{'summary_text': ...}] output
    return summary[0]['summary_text']


##abstract summarizer

#T5 model
summarizer = pipeline('summarization', model="t5-base")
summary_t5_20 = summarizer(text, min_length=5, max_length=20)
summary_t10_30 = summarizer(text, min_length=10, max_length=30)
summary_t_bigger = summarizer(text, min_length=100, max_length=150)

print(text)
print(summary_utils(summary_t5_20))
print(summary_utils(summary_t10_30))
print(summary_utils(summary_t_bigger))
'''
summary_t_create = summarizer(text, min_length = 500, max_length = 1000)
[{'summary_text': 'Microsoft has launched Intelligent Cloud Hub to empower 
 students with AI-ready skills . the three-year collaborative program will 
 support around 100 institutions with AI infrastructure, course content and 
 curriculum, developer support, development tools and give students access 
 to cloud and AI services . as part of the program, the redmond giant will 
Example #16
from transformers import pipeline

translator = pipeline('translation_en_to_de')  # build the pipeline once, not per call


def translate(text):
    # the pipeline returns a list like [{'translation_text': '...'}]
    return translator(text)[0]['translation_text']
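
A quick usage sketch (the input sentence is made up):

print(translate("Machine learning is fun."))  # e.g. "Maschinelles Lernen macht Spaß."
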
    process.process_pdf_folder(f"{args.pdf_folder}",
                               f"{args.pdf_folder}/pdf_to_text")

    ## This function then adds metadata (from the original bibinfo files) to the JSON files
    process.combine_json_and_bibinfo(f"{args.pdf_folder}/pdf_to_text",
                                     f"{args.pdf_folder}")

    # Extraction

    tokenizer = BertTokenizer.from_pretrained(args.model,
                                              do_basic_tokenize=False,
                                              do_lower_case=False)

    ner = pipeline(task='ner',
                   framework='pt',
                   model=args.model,
                   tokenizer=tokenizer,
                   grouped_entities=True)

    ner.ignore_labels = []

    nlp = English()
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)

    json_files = (
        glob.glob(f"{args.pdf_folder}/pdf_to_text/sections_json/*.json"))

    for file in json_files:
        with open(file) as f:
            json_data = json.load(f)
Example #18
    def __init__(self, config):
        device = 0 if torch.cuda.is_available() else -1
        print(f"using device: {'cuda' if device == 0 else 'cpu'}")

        self.analyzer = pipeline(task="sentiment-analysis", device=device)
        self.summarizer = pipeline(task="summarization", device=device)
import json
import os

from flask import Flask as Flask, send_from_directory, request, Response, redirect, url_for
#### only needed for cross-origin requests:
# from flask_cors import CORS
from transformers import pipeline

__author__ = 'Hendrik Strobelt'

app = Flask(__name__)
#### only needed for cross-origin requests:
# CORS(app)

# load sentiment analysis from huggingface
nlp = pipeline('sentiment-analysis')


# redirect requests from root to index.html
@app.route('/')
def hello_world():
    return redirect('client/index.html')


# functional backend taking sentences as request and returning
# sentiment direction and score as JSON result
@app.route('/api/sentiment', methods=['POST'])
def sentiment():
    sentences = request.json['sentences']

    results = [nlp(x) for x in sentences]
    return jsonify(results)
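
A hypothetical client call against the endpoint above, assuming the default Flask port:

import requests

resp = requests.post('http://localhost:5000/api/sentiment',
                     json={'sentences': ['I love this.', 'I hate this.']})
print(resp.json())
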
    def __init__(self):
        self.summarizer_bart = pipeline('summarization', model='facebook/bart-large-cnn',
                                        tokenizer='facebook/bart-large-cnn')
        self.summarizer_T5 = T5ForConditionalGeneration.from_pretrained('t5-base')
        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
Example #21
import os
from argparse import ArgumentParser
from transformers import pipeline

parser = ArgumentParser()
parser.add_argument('-m', '--model', default='gpt2-small-dutch')
parser.add_argument('-d', '--device', type=int, default=-1)
args = parser.parse_args()

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

p = pipeline('text-generation', model=args.model, device=args.device)

while True:
    txt = input('prompt > ')
    txt = p(txt)[0]['generated_text']
    print(txt)
Example #22
dataset_args = container.G({
    "predict_data_nums": 3000,
    "predict_text_path":
    "/home/ubuntu/likun/nlp_data/zsl/BenchmarkingZeroShot/topic_yahoo/test.csv",
    "predict_use_labels": "all",
    "predict_single_label_max_data_nums": "all",
    "max_length": 128
})
predict_dataset = TopicYahoo(
    text_path=dataset_args.predict_text_path,
    tokenizer=None,
    max_length=dataset_args.max_length,
    data_nums=dataset_args.predict_data_nums,
    use_labels=dataset_args.predict_use_labels,
    single_label_max_data_nums=dataset_args.predict_single_label_max_data_nums)
predict_dataset.build_annotations(predict_dataset.classify_data_build)
classifier = pipeline(
    'zero-shot-classification',
    model='/home/ubuntu/likun/nlp_pretrained/bart-large-mnli',
    device=1)
predict_labels = []
true_labels = []
for index, item in tqdm.tqdm(predict_dataset.annotations.items(),
                             desc="Testing"):
    res = classifier(item['text'], TopicYahoo.labels)
    true_labels.append(item['label_name'])
    predict_labels.append(res['labels'][0].lower())
report = classification_report(true_labels, predict_labels)
print(report)
    def test_pt_defaults(self):
        # Test that pipelines can be correctly loaded without any argument
        for task in self.pipelines:
            with self.subTest(msg="Testing Torch defaults with PyTorch and {}".format(task)):
                pipeline(task, framework="pt")
        match = match.astype(int)
        # count this actor's messages: .sum() over the boolean mask (len()
        # of the mask would just return the total number of rows)
        print('For ' + str(all_actors[aa]) + ' we have a match of ' +
              str(int(np.mean(match) * 100)) + '% and ' +
              str((mydata_test_actors['from'] == all_actors[aa]).sum()) +
              ' messages')

#%% interesting ML to add

mydata1 = mydata.iloc[0:2170]
# DataFrame.append returns a new frame, so the result must be reassigned
mydata1 = mydata1.append(mydata.iloc[2172:2953])

mydata = mydata1

from transformers import pipeline

nlp = pipeline("sentiment-analysis")

#result = nlp("Ti odio merda")[0]
#print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
#
#result = nlp("Caterina mi ha comprato una torta, che dolcissima!")[0]
#print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

mydata['positiveness'] = ['' for i in range(len(mydata['text']))]
mydata['score'] = [0.0 for i in range(len(mydata['text']))]
mydata['positiveness_score'] = [0.0 for i in range(len(mydata['text']))]

for ii in range(len(mydata['text'])):
    if ii % 100 == 0:
        print("Training for sentiment analysis ..." +
              str(int(ii / len(mydata['text']) * 100)) + "% completed ")
    def test_integration_torch_summarization(self):
        nlp = pipeline(task="summarization", device=DEFAULT_DEVICE_NUM)
        cnn_article = (
            ' (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. '
            'The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes. CNN\'s Vasco Cotovio, Kareem Khadder and Faith Karimi contributed to this report.')
        expected_cnn_summary = " The Palestinian Authority becomes the 123rd member of the International Criminal Court . The move gives the court jurisdiction over alleged crimes in Palestinian territories . Israel and the United States opposed the Palestinians' efforts to join the court . Rights group Human Rights Watch welcomes the move, says governments seeking to penalize Palestine should end pressure ."
        result = nlp(cnn_article)
        self.assertEqual(result[0]["summary_text"], expected_cnn_summary)
Example #26
    parser.add_argument('--input', type=str, required=True, help='Location of Input File')
    args = parser.parse_args()
    print('Entered', args.iter)
    
    #Read data
    input_file = args.input
    #input_file = 'sentiment_mal_train_binary.pkl'
    #input_file = 'processed_data_binary.pkl'
    print('Input File:', input_file)
    sentences, labels, tweet_ids = read_data(input_file)


    #Define classifier
    print()
    if args.iter == '0':
        classifier = pipeline('sentiment-analysis', 'cardiffnlp/twitter-roberta-base-sentiment', device=0)
    else:
        saved_model = 'iteration' + args.iter + '/saved_model'
        print('Model Directory:', saved_model)
        tokenizer = AutoTokenizer.from_pretrained(saved_model)
        model = AutoModelForSequenceClassification.from_pretrained(saved_model)
        classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, device=0)
    
    pred_label_score = predict(sentences, labels, tweet_ids, classifier)
    print()


    #Save results
    output_file = args.output + '/iteration_' + args.iter + '.pkl'
    print('Output File:', output_file)
    save_data(output_file, pred_label_score)
    def test_tf_zero_shot_outputs(self):
        nlp = pipeline(task="zero-shot-classification",
                       model="roberta-large-mnli",
                       framework="tf")
        self._test_zero_shot_pipeline_outputs(nlp)
Example #28
from transformers import pipeline

ner = pipeline('ner')
sequence = "This story is written by Raoof Naushad from India working at Accubits Technologies."
print(ner(sequence))
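
The token-level output splits multi-word entities into word pieces. On the transformers versions these examples target, grouped_entities merges them back into whole entities (the same flag is used in the PDF-extraction example above):

ner_grouped = pipeline('ner', grouped_entities=True)
print(ner_grouped(sequence))
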
    def test_torch_question_answering(self):
        for model_name in QA_FINETUNED_MODELS:
            nlp = pipeline(task="question-answering",
                           model=model_name,
                           tokenizer=model_name)
            self._test_qa_pipeline(nlp)
Example #30
from transformers import pipeline, AutoModelForMaskedLM, AutoTokenizer
from random import randint
from collections import Counter
import torch

# Sofiane: there are two ways to create an unmasker.
# Method 1: pass a model path straight to pipeline() (supported from some
# Hugging Face version onward).
unmasker = pipeline('fill-mask', model='./models/215000')

# Method 2: you can create it manually in case you need specific tokenizer.
# model = AutoModelForMaskedLM.from_pretrained('./models/215000')
# tokenizer = AutoTokenizer.from_pretrained("./models/215000")
# unmasker = pipeline('fill-mask', model=model, tokenizer=tokenizer)

# take a random trajectory traj:
traj = ("8a536bc8b697fff 8a536b524aeffff 8a536b524337fff " +\
"8a536b5243a7fff " +\
"8a536bc8942ffff 8a536bc8972ffff 8a536bc89367fff 8a536bcd4197fff "+\
"8a536bcd4a57fff 8a536bcd5d97fff").split(' ')

# Insert [MASK] at a preferred position; here the fifth cell (index 4).
pos = 4
traj.insert(pos, '[MASK]')
print('INPUT:', ' '.join(traj))

# Sofiane: Here I'll perform 15 successive predictions using beam search. The
# idea is to start by requesting a prediction for the cell in position 4,
# then iteratively append the predicted value to the query and push [MASK] to
# i+1. If the predicted hex == the previous hex (the one before [MASK]), take
# the second prediction.
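
The example ends before the loop the comments describe. A minimal sketch of it, assuming the fill-mask pipeline's default top-5 output and that token_str maps cleanly back to hex IDs for this custom tokenizer:

for _ in range(15):
    predictions = unmasker(' '.join(traj))
    best = predictions[0]['token_str']
    # fall back to the second candidate when the top prediction merely
    # repeats the hex right before the mask
    if best == traj[pos - 1]:
        best = predictions[1]['token_str']
    traj[pos] = best            # commit the prediction at the masked slot
    pos += 1
    traj.insert(pos, '[MASK]')  # push [MASK] one step forward

traj.pop(pos)  # drop the final, unused [MASK]
print('OUTPUT:', ' '.join(traj))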