# from tensorflow.compat.v1 import ConfigProto
# from tensorflow.compat.v1 import InteractiveSession
#
# # memory explodes without this
# config = ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 1.0
# config.gpu_options.allow_growth = True
# session = InteractiveSession(config=config)

from transformers import pipeline

sentiment_pipeline = pipeline('sentiment-analysis')


def evaluate_sentiment(sentence):
    # catch failures explicitly; the original bare except could return an
    # unbound local and shadowed the imported `pipeline` name
    try:
        result = sentiment_pipeline(sentence)
    except Exception:
        print("evaluating {}".format(sentence), end='')
        print("Oh man")
        return None
    return result


if __name__ == '__main__':
    from concurrent.futures.thread import ThreadPoolExecutor
    from tqdm import tqdm
    from random import shuffle

    # three-letter animal names for benchmark
    sources = [
def test_torch_zero_shot_classification(self):
    for model_name in TEXT_CLASSIF_FINETUNED_MODELS:
        nlp = pipeline(task="zero-shot-classification", model=model_name, tokenizer=model_name)
        self._test_zero_shot_pipeline(nlp)
def test_torch_conversation(self):
    for model_name in DIALOGUE_FINETUNED_MODELS:
        nlp = pipeline(task="conversational", model=model_name, tokenizer=model_name)
        self._test_conversation_pipeline(nlp)
def test_torch_ner(self):
    mandatory_keys = {"entity", "word", "score"}
    for model_name in NER_FINETUNED_MODELS:
        nlp = pipeline(task="ner", model=model_name, tokenizer=model_name)
        self._test_ner_pipeline(nlp, mandatory_keys)
def test_torch_feature_extraction(self):
    for model_name in FEATURE_EXTRACT_FINETUNED_MODELS:
        nlp = pipeline(task="feature-extraction", model=model_name, tokenizer=model_name)
        self._test_mono_column_pipeline(nlp, VALID_INPUTS, {})
import pandas as pd
import random
import re

import requests
from nltk.corpus import stopwords
from stemming.porter2 import stem
from transformers import pipeline

FAST_GEO = pd.read_csv("data/geo_uri_label_utf8.csv", names=["URI", "Label"])
FAST_TOPICS = pd.read_csv("data/topic_uri_label_utf8.csv", names=["URI", "Label"])

for fast_df in [FAST_GEO, FAST_TOPICS]:
    fast_df["stemmed"] = fast_df["Label"].apply(lambda x: stem(x.lower()))

NER = pipeline("ner")
special_char_re = re.compile(r'[^a-zA-Z]')
stop_words_list = stopwords.words('english')


def cleanup(term: str) -> str:
    # strip non-letters and stopwords from each whitespace-separated token
    cleaned = []
    for token in term.split():
        cleaned_token = special_char_re.sub(' ', token).lower()
        if cleaned_token in stop_words_list:
            continue
        cleaned.append(cleaned_token)
    return ' '.join(cleaned)


def create_datasets(etd_path: str):
import random

from transformers import pipeline
from textblob import TextBlob
from dotenv import load_dotenv
import pymongo
import requests
import matplotlib.pyplot as plt
from tqdm import tqdm

load_dotenv()

# ## Text Generation

generator = pipeline("text-generation", framework="pt")


def generate_article_from_title(title: str) -> str:
    article = generator(title, max_length=random.randint(500, 1500))[0]["generated_text"]
    # drop everything up to the first sentence break so the prompt title is not repeated
    if "." in article:
        return article[article.index(".") + 1:]
    return article

# ## Summarisation
def get_qa_pipeline():
    qa_pipeline = pipeline("question-answering")
    return qa_pipeline
def answer_question(pipeline, question, context):
    result = pipeline(question=question, context=context)
    # return f"Answer: {result['answer']}, score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}"
    return result
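# Minimal usage sketch for the two helpers above (the question and context
# strings are illustrative, not from the original file):
qa = get_qa_pipeline()
print(answer_question(qa, "Who founded the company?",
                      "The company was founded in 2012 by two former students."))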
# coding: utf-8

import argparse

import azure.cognitiveservices.speech as speechsdk
from transformers import pipeline

summarizer = pipeline("summarization")

# Creates an instance of a speech config with the specified subscription key and
# service region. Replace with your own subscription key and region.
speech_key, service_region = "3021013d1649482f91008c7df0a0d971", "centralindia"
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

parser = argparse.ArgumentParser()
# audio file to transcribe and summarize
parser.add_argument('--filename', type=str)
args = parser.parse_args()


def summarize_pipeline(audio, chunks_output_folder='audio_chunks'):
    get_audio_chunks(audio)
    transcript = transcribe_each_chunk(chunks_output_folder)
    summary = summarize(transcript)
    return summary
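# Hedged sketches of the three helpers called above; they are not in this
# excerpt. get_audio_chunks is assumed to use pydub, transcribe_each_chunk the
# Azure Speech SDK configured above; names, thresholds, and generation lengths
# are guesses.
import os

from pydub import AudioSegment
from pydub.silence import split_on_silence


def get_audio_chunks(audio, chunks_output_folder='audio_chunks'):
    # split the recording on silence so each chunk is short enough to recognize
    sound = AudioSegment.from_wav(audio)
    chunks = split_on_silence(sound, min_silence_len=500,
                              silence_thresh=sound.dBFS - 14)
    os.makedirs(chunks_output_folder, exist_ok=True)
    for i, chunk in enumerate(chunks):
        chunk.export(os.path.join(chunks_output_folder, f'chunk{i:03d}.wav'),
                     format='wav')


def transcribe_each_chunk(chunks_output_folder):
    # recognize each chunk once and concatenate the partial transcripts
    transcript = ''
    for name in sorted(os.listdir(chunks_output_folder)):
        audio_config = speechsdk.audio.AudioConfig(
            filename=os.path.join(chunks_output_folder, name))
        recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config,
                                                audio_config=audio_config)
        transcript += recognizer.recognize_once().text + ' '
    return transcript


def summarize(transcript):
    # single-shot summary; a very long transcript would need to be chunked first
    return summarizer(transcript, max_length=150, min_length=30)[0]['summary_text']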
import csv

from bs4 import BeautifulSoup
from transformers import pipeline

csv_file = 'refract_data_engineering_articles.csv'

# read the articles from the csv file and put them in the list
articles = []
with open(csv_file, 'r') as csv_in:
    csv_reader = csv.reader(csv_in, delimiter=',')
    for row_num, row_content in enumerate(csv_reader):
        if row_num != 0:  # skip the header row
            articles.append(row_content)

# run the articles through the model and get the summaries
print('Summarizing the assignment articles...')
summaries = []
summarizer = pipeline('summarization', model='t5-small', tokenizer='t5-small')
for article in articles:
    summaries.append(
        summarizer(article, max_length=20, min_length=0, do_sample=False,
                   clean_up_tokenization_spaces=True))
    print('...')

# store the summaries in the csv file
with open(csv_file, 'w') as csv_out:
    csv_writer = csv.writer(csv_out, lineterminator='\n')
    new_content = [['source_content', 'ml_summary']]
    for i, summary in enumerate(summaries):
        new_content.append([articles[i][0], summary[0]['summary_text']])
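    # Hedged completion: the excerpt is cut off here; writing the assembled
    # rows presumably finishes the block.
    csv_writer.writerows(new_content)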
from elasticsearch import Elasticsearch
from flask import Flask, render_template, jsonify, request
from transformers import pipeline, AlbertTokenizer, AlbertForQuestionAnswering
import glob
import textract
import os
import torch
import json
import collections

indexName = 'triviaqa_wiki'

# Init Albert pipeline
qa_pipeline = pipeline('question-answering',
                       model="ktrapeznikov/albert-xlarge-v2-squad-v2",
                       tokenizer="albert-xlarge-v2",
                       device=0)

question = "What is the capital of the Netherlands?"
context = r"The four largest cities in the Netherlands are Amsterdam, Rotterdam, The Hague and Utrecht.[17] Amsterdam is the country's most populous city and nominal capital,[18] while The Hague holds the seat of the States General, Cabinet and Supreme Court.[19] The Port of Rotterdam is the busiest seaport in Europe, and the busiest in any country outside East Asia and Southeast Asia, behind only China and Singapore."

# initQA = qa_pipeline(question=question, context=context)
# print(initQA)

# Grab Elasticsearch instance
config = {'host': 'mc.ocbe.de', 'port': 9200}
es = Elasticsearch([config])

# test connection
es.ping()
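# Hedged sketch of how the pieces above are presumably combined (the retrieval
# helper itself is not in this excerpt; the document field name 'text' and the
# re-ranking by QA score are assumptions):
def answer(question, topk=3):
    # full-text search for candidate passages, then extractive QA over each hit
    hits = es.search(index=indexName,
                     body={'query': {'match': {'text': question}}, 'size': topk})
    answers = []
    for hit in hits['hits']['hits']:
        res = qa_pipeline(question=question, context=hit['_source']['text'])
        answers.append(res)
    return sorted(answers, key=lambda a: a['score'], reverse=True)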
from transformers import pipeline

qa_model = pipeline('question-answering')
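# Minimal usage sketch (inputs are illustrative, not from the original):
result = qa_model(question="Where is the Eiffel Tower?",
                  context="The Eiffel Tower stands in Paris, France.")
print(result['answer'], round(result['score'], 4))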
from flask import Flask, render_template, request

app = Flask(__name__)

from transformers import pipeline
# from transformers import PreTrainedModel
# from transformers import TFT5Model, TFBertModel
import requests
import pprint
import time

summarizer_bart = pipeline("summarization")
summarizer_t5 = pipeline("summarization", model="t5-base")


class models:
    def __init__(self, text, length):
        self.length = length
        self.text = text

    def bart(self):
        return list(summarizer_bart(self.text, min_length=self.length,
                                    max_length=self.length + 10)[0].values())[0]

    def t5(self):
        return list(summarizer_t5(self.text, min_length=self.length,
                                  max_length=self.length + 10)[0].values())[0]
import transformers
from transformers import pipeline

f = open('micro_soft.txt', 'r')
text = f.read()


def summary_utils(summary):
    summar = summary[0]['summary_text']
    return summar


# abstract summarizer
# T5 model
summarizer = pipeline('summarization', model="t5-base")
summary_t5_20 = summarizer(text, min_length=5, max_length=20)
summary_t10_30 = summarizer(text, min_length=10, max_length=30)
summary_t_bigger = summarizer(text, min_length=100, max_length=150)

print(text)
print(summary_utils(summary_t5_20))
print(summary_utils(summary_t10_30))
print(summary_utils(summary_t_bigger))

'''
summary_t_create = summarizer(text, min_length=500, max_length=1000)
[{'summary_text': 'Microsoft has launched Intelligent Cloud Hub to empower students with AI-ready skills . the three-year collaborative program will support around 100 institutions with AI infrastructure, course content and curriculum, developer support, development tools and give students access to cloud and AI services . as part of the program, the redmond giant will
def translate(text):
    # NOTE: building the pipeline on every call is slow; hoist it out to reuse it
    translator = pipeline('translation_en_to_de')
    return translator(text)
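# Usage sketch: the pipeline returns a list of dicts keyed by
# 'translation_text' (the input string is illustrative):
print(translate("How are you today?")[0]['translation_text'])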
process.process_pdf_folder(f"{args.pdf_folder}", f"{args.pdf_folder}/pdf_to_text")

## This function then adds metadata (from the original bibinfo files) to the json files
process.combine_json_and_bibinfo(f"{args.pdf_folder}/pdf_to_text", f"{args.pdf_folder}")

# Extraction
tokenizer = BertTokenizer.from_pretrained(args.model, do_basic_tokenize=False, do_lower_case=False)
ner = pipeline(task='ner', framework='pt', model=args.model, tokenizer=tokenizer, grouped_entities=True)
ner.ignore_labels = []

nlp = English()
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)

json_files = glob.glob(f"{args.pdf_folder}/pdf_to_text/sections_json/*.json")
for file in json_files:
    with open(file) as f:
        json_data = json.load(f)
def __init__(self, config):
    device = 0 if torch.cuda.is_available() else -1
    print(f"using device: {'cuda' if device == 0 else 'cpu'}")
    self.analyzer = pipeline(task="sentiment-analysis", device=device)
    self.summarizer = pipeline(task="summarization", device=device)
import json
import os

from flask import Flask as Flask, send_from_directory, request, Response, redirect, url_for
#### only needed for cross-origin requests:
# from flask_cors import CORS
from transformers import pipeline

__author__ = 'Hendrik Strobelt'

app = Flask(__name__)
#### only needed for cross-origin requests:
# CORS(app)

# load sentiment analysis from huggingface
nlp = pipeline('sentiment-analysis')


# redirect requests from root to index.html
@app.route('/')
def hello_world():
    return redirect('client/index.html')


# functional backend taking sentences as request and returning
# sentiment direction and score as JSON result
@app.route('/api/sentiment', methods=['POST'])
def sentiment():
    sentences = request.json['sentences']
    results = [nlp(x) for x in sentences]
def __init__(self):
    self.summarizer_bart = pipeline('summarization',
                                    model='facebook/bart-large-cnn',
                                    tokenizer='facebook/bart-large-cnn')
    self.summarizer_T5 = T5ForConditionalGeneration.from_pretrained('t5-base')
    self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
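# Hedged sketch of a companion T5 method for the class above (the method is
# not shown in this excerpt; its name and generation settings are assumptions):
def summarize_t5(self, text, max_length=150):
    # T5 expects a task prefix; truncate long inputs to the encoder limit
    inputs = self.tokenizer.encode('summarize: ' + text, return_tensors='pt',
                                   max_length=512, truncation=True)
    ids = self.summarizer_T5.generate(inputs, max_length=max_length,
                                      min_length=30, num_beams=4,
                                      early_stopping=True)
    return self.tokenizer.decode(ids[0], skip_special_tokens=True)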
import os
from argparse import ArgumentParser

from transformers import pipeline

parser = ArgumentParser()
parser.add_argument('-m', '--model', default='gpt2-small-dutch')
parser.add_argument('-d', '--device', type=int, default=-1)
args = parser.parse_args()

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

p = pipeline('text-generation', model=args.model, device=args.device)

while True:
    txt = input('prompt > ')
    txt = p(txt)[0]['generated_text']
    print(txt)
dataset_args = container.G({
    "predict_data_nums": 3000,
    "predict_text_path": "/home/ubuntu/likun/nlp_data/zsl/BenchmarkingZeroShot/topic_yahoo/test.csv",
    "predict_use_labels": "all",
    "predict_single_label_max_data_nums": "all",
    "max_length": 128
})

predict_dataset = TopicYahoo(
    text_path=dataset_args.predict_text_path,
    tokenizer=None,
    max_length=dataset_args.max_length,
    data_nums=dataset_args.predict_data_nums,
    use_labels=dataset_args.predict_use_labels,
    single_label_max_data_nums=dataset_args.predict_single_label_max_data_nums)
predict_dataset.build_annotations(predict_dataset.classify_data_build)

classifier = pipeline(
    'zero-shot-classification',
    model='/home/ubuntu/likun/nlp_pretrained/bart-large-mnli',
    device=1)

predict_labels = []
true_labels = []
for index, item in tqdm.tqdm(predict_dataset.annotations.items(), desc="Testing"):
    res = classifier(item['text'], TopicYahoo.labels)
    true_labels.append(item['label_name'])
    predict_labels.append(res['labels'][0].lower())

report = classification_report(true_labels, predict_labels)
print(report)
def test_pt_defaults(self):
    # Test that pipelines can be correctly loaded without any argument
    for task in self.pipelines:
        with self.subTest(msg="Testing Torch defaults with PyTorch and {}".format(task)):
            pipeline(task, framework="pt")
match = match.astype(int)
print('For ' + str(all_actors[aa]) + ' we have a match of ' +
      str(int(np.mean(match) * 100)) + '% and ' +
      str(len(mydata_test_actors['from'] == all_actors[aa])) + ' messages')

# %% interesting ML to add
mydata1 = mydata.iloc[0:2170]
mydata1 = mydata1.append(mydata.iloc[2172:2953])  # DataFrame.append returns a new frame
mydata = mydata1

from transformers import pipeline

nlp = pipeline("sentiment-analysis")
# result = nlp("Ti odio merda")[0]  # Italian: "I hate you, you piece of crap"
# print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
#
# result = nlp("Caterina mi ha comprato una torta, che dolcissima!")[0]  # Italian: "Caterina bought me a cake, how sweet!"
# print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

mydata['positiveness'] = ['' for i in range(len(mydata['text']))]
mydata['score'] = [0.0 for i in range(len(mydata['text']))]
mydata['positiveness_score'] = [0.0 for i in range(len(mydata['text']))]
for ii in range(len(mydata['text'])):
    if ii % 100 == 0:
        print("Running sentiment analysis ..." + str(int(ii / len(mydata['text']) * 100)) + "% completed ")
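    # Hedged completion: the loop body is cut off in this excerpt. Filling the
    # three columns initialised above is assumed, and the signed
    # positiveness_score combination is a guess.
    row = mydata.index[ii]
    result = nlp(mydata.at[row, 'text'])[0]
    mydata.at[row, 'positiveness'] = result['label']
    mydata.at[row, 'score'] = result['score']
    mydata.at[row, 'positiveness_score'] = (result['score'] if result['label'] == 'POSITIVE'
                                            else -result['score'])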
def test_integration_torch_summarization(self):
    nlp = pipeline(task="summarization", device=DEFAULT_DEVICE_NUM)
    cnn_article = ' (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes. CNN\'s Vasco Cotovio, Kareem Khadder and Faith Karimi contributed to this report.'
    expected_cnn_summary = " The Palestinian Authority becomes the 123rd member of the International Criminal Court . The move gives the court jurisdiction over alleged crimes in Palestinian territories . Israel and the United States opposed the Palestinians' efforts to join the court . Rights group Human Rights Watch welcomes the move, says governments seeking to penalize Palestine should end pressure ."
    result = nlp(cnn_article)
    self.assertEqual(result[0]["summary_text"], expected_cnn_summary)
parser.add_argument('--input', type=str, required=True, help='Location of Input File')
args = parser.parse_args()
print('Entered', args.iter)

# Read data
input_file = args.input
# input_file = 'sentiment_mal_train_binary.pkl'
# input_file = 'processed_data_binary.pkl'
print('Input File:', input_file)
sentences, labels, tweet_ids = read_data(input_file)

# Define classifier
print()
if args.iter == '0':
    classifier = pipeline('sentiment-analysis', 'cardiffnlp/twitter-roberta-base-sentiment', device=0)
else:
    saved_model = 'iteration' + args.iter + '/saved_model'
    print('Model Directory:', saved_model)
    tokenizer = AutoTokenizer.from_pretrained(saved_model)
    model = AutoModelForSequenceClassification.from_pretrained(saved_model)
    classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, device=0)

pred_label_score = predict(sentences, labels, tweet_ids, classifier)
print()

# Save results
output_file = args.output + '/iteration_' + args.iter + '.pkl'
print('Output File:', output_file)
save_data(output_file, pred_label_score)
def test_tf_zero_shot_outputs(self):
    nlp = pipeline(task="zero-shot-classification", model="roberta-large-mnli", framework="tf")
    self._test_zero_shot_pipeline_outputs(nlp)
from transformers import pipeline

ner = pipeline('ner')
sequence = "This story is written by Raoof Naushad from India working at Accubits Technologies."
print(ner(sequence))
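# Hedged variant: merging sub-word tokens into whole entities; the
# grouped_entities flag matches its use in the PDF-extraction snippet above.
ner_grouped = pipeline('ner', grouped_entities=True)
print(ner_grouped(sequence))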
def test_torch_question_answering(self): for model_name in QA_FINETUNED_MODELS: nlp = pipeline(task="question-answering", model=model_name, tokenizer=model_name) self._test_qa_pipeline(nlp)
from transformers import pipeline, AutoModelForMaskedLM, AutoTokenizer
from random import randint
from collections import Counter
import torch

# Sofiane: there are two ways to create an unmasker:
# Method 1: can be used in HuggingFace starting from a given version I don't
# remember.
unmasker = pipeline('fill-mask', model='./models/215000')
# Method 2: you can create it manually in case you need a specific tokenizer.
# model = AutoModelForMaskedLM.from_pretrained('./models/215000')
# tokenizer = AutoTokenizer.from_pretrained("./models/215000")
# unmasker = pipeline('fill-mask', model=model, tokenizer=tokenizer)

# take a random trajectory traj:
traj = ("8a536bc8b697fff 8a536b524aeffff 8a536b524337fff " +
        "8a536b5243a7fff " +
        "8a536bc8942ffff 8a536bc8972ffff 8a536bc89367fff 8a536bcd4197fff " +
        "8a536bcd4a57fff 8a536bcd5d97fff").split(' ')

# Insert [MASK] at a preferred position, in my example it is position 5 (0-4)
pos = 4
traj.insert(pos, '[MASK]')
print('INPUT:', ' '.join(traj))

# Sofiane: Here I'll perform 15 successive predictions using beam search. The
# idea is that I'll start by requesting a prediction for the cell in position 4,
# then iteratively append the predicted value to the query and push [MASK] to
# i+1. If the predicted hex == the previous hex (the one before [MASK]), take
# the second prediction.
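# Hedged sketch of the loop the comment above describes (the original file is
# cut off here): greedy successive unmasking with the stated tie-break rule.
# Assumes the pipeline output exposes 'token_str' and that each prediction is
# a bare hex cell id.
for step in range(15):
    mask_index = pos + step                   # current position of [MASK]
    preds = unmasker(' '.join(traj))          # top-k candidates for [MASK]
    best = preds[0]['token_str']
    if best == traj[mask_index - 1]:          # repeated hex: take the runner-up
        best = preds[1]['token_str']
    traj[mask_index] = best                   # commit the prediction
    traj.insert(mask_index + 1, '[MASK]')     # push [MASK] one cell right
traj.pop(pos + 15)                            # drop the final, unused [MASK]
print('OUTPUT:', ' '.join(traj))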