def preprocess(models_dir, processors, extractive, cnn_dm_file, article_file, summary_file,
               output_train_files, output_test_files, split_ratio):
    """
    This function goes through the entire preprocessing pipeline, resulting in files ready to be
    used as the input of the graph neural network.
    :param models_dir: The path to the stanfordnlp directory.
    :param processors: List of parsers to use. Options: tokenize, mwt, pos, lemma, depparse.
    :param extractive: Whether to make an extractive summary or not. If true, the input file
                       should contain the best ids.
    :param cnn_dm_file: This file should contain the articles and the summaries in jsonl format.
    :param article_file: This file will contain every article graph.
    :param summary_file: This file will contain every summary graph.
    :param output_train_files: The paths to save the training files. The first element is the
                               training input, the second is the expected output.
    :param output_test_files: The paths to save the validation files. The first element is the
                              validation input, the second is the expected output.
    :param split_ratio: The ratio of data used for training vs validation.
    :return: None
    """
    from graph_transformations.preprocessor import main as stanford_preprocess
    if extractive:
        from graph_transformations.cnn_extractive_parser import main as cnn_process
    else:
        from graph_transformations.cnn_parser import main as cnn_process
    from graph_transformations.train_test_split import train_test_split
    import stanfordnlp

    if not os.path.exists(models_dir):
        stanfordnlp.download('en', resource_dir=models_dir)

    correct_processors = ["tokenize", "mwt", "pos", "lemma", "depparse"]
    incorrect = [i for i in processors if i not in correct_processors]
    if len(incorrect) != 0:
        raise ValueError("The following processor values are incorrect: {}".format(incorrect))
    if not os.path.exists(cnn_dm_file):
        raise FileNotFoundError("The input file is not found. {} not found".format(cnn_dm_file))

    pipeline = stanfordnlp.Pipeline(models_dir=models_dir, processors=processors)
    processed_file = "{}_processed.jsonl".format(os.path.splitext(cnn_dm_file)[0])
    stanford_preprocess(pipeline, cnn_dm_file, processed_file)

    dependency_file = "dep.jsonl"
    word_file = "words.jsonl"
    pos_file = "pos.jsonl"
    cnn_process(processed_file, article_file, summary_file, dependency_file, word_file, pos_file,
                dependency_file[:-1], word_file[:-1], pos_file[:-1])

    ratio = int(split_ratio) if split_ratio >= 1.0 else int(split_ratio * 100)
    train_test_split(article_file, output_train_files[0], output_test_files[0], ratio)
    train_test_split(summary_file, output_train_files[1], output_test_files[1], ratio)
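# A minimal, hypothetical invocation of preprocess() above. Every path and value below is a
# placeholder chosen for illustration, not taken from the source; only the keyword names come
# from the function signature.
preprocess(
    models_dir="stanfordnlp_resources",
    processors=["tokenize", "mwt", "pos", "lemma", "depparse"],
    extractive=True,
    cnn_dm_file="cnn_dm_best_ids.jsonl",
    article_file="article_graphs.jsonl",
    summary_file="summary_graphs.jsonl",
    output_train_files=("train_input.jsonl", "train_target.jsonl"),
    output_test_files=("valid_input.jsonl", "valid_target.jsonl"),
    split_ratio=0.8,
)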
def __init__(self, lang='en', config='default'):
    super().__init__()
    # Downloads the language models for the neural pipeline if never installed before
    if 'stanfordnlp_resources' not in listdir('./'):
        stanfordnlp.download('en')
    # Initialize pipeline
    self.nlp = stanfordnlp.Pipeline(**config_stanford_nlp[config])
    self.name_save = 'stanfordnlp'
def main():
    print(gen_info(1, "downloading resources to analyse Japanese text"))
    print(gen_info(1, "please answer Y to install these resources"))
    stanfordnlp.download('ja')
    print(gen_info(1, "finished downloading"))
    print(gen_info(1, "testing dependency modules"))
    test_stanfordnlp(SAMPLE_TEXT)
    print(gen_info(1, "finished testing"))
def __init__(self, nlp=None, lang="tr"):
    if nlp:
        self.nlp = nlp
    else:
        try:
            self.nlp = stanfordnlp.Pipeline(lang=lang)
        except Exception:
            # The models are probably missing; download them and retry.
            stanfordnlp.download(lang)
            self.nlp = stanfordnlp.Pipeline(lang=lang)
def __init__(self):
    if not os.path.isdir(COM.STANFORD_NLP_RESOURCES_PATH):
        stanfordnlp.download('en', force=True, resource_dir=COM.STANFORD_NLP_RESOURCES_PATH)
    self.nlp = stanfordnlp.Pipeline(models_dir=COM.STANFORD_NLP_RESOURCES_PATH)
    glove_100000_word_list_df = pd.read_csv(COM.CSV_WORD_LIST_GLOVE_100000, header=None)
    self.__stemmer = SnowballStemmer("english")
    self.__list_glove_words = glove_100000_word_list_df[0].tolist()
def test():
    stanfordnlp.download('en')
    sentence = "The argument is used to specify the task All five processors are taken by default if no argument is passed Here is a quick overview of the processors"
    # weights = load_model()
    nlp = stanfordnlp.Pipeline(processors="tokenize,depparse")
    f = nlp(sentence)
    m = 0
def main(data):
    nltk.download('stopwords')

    # Pick which PoS tags you want
    postag_title = 'Please pick POS tags (SPACE to mark, ENTER to continue)'
    postags = [
        'ADJ', 'ADP', 'PUNCT', 'ADV', 'AUX', 'SYM', 'INTJ', 'CCONJ', 'X',
        'NOUN', 'DET', 'PROPN', 'NUM', 'VERB', 'PART', 'PRON', 'SCONJ'
    ]
    wanted_pos = pick(postags, postag_title, multi_select=True, min_selection_count=1)
    wanted_pos = [pos[0] for pos in wanted_pos]

    # Pick language
    lang_title = 'Please choose which language the text is in.'
    langs = ['en', 'da', 'other']
    lang, lang_title = pick(langs, lang_title)
    if lang == 'other':
        lang = input('Please input language code '
                     '(see stanfordnlp.github.io/stanfordnlp/models.html)')

    # Download model for nlp.
    if not os.path.exists(os.path.join(os.environ['HOME'], 'stanfordnlp_resources',
                                       f'{lang}_ddt_models')):
        stanfordnlp.download(lang)

    # Set up nlp pipeline
    nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,lemma,pos', lang=lang)

    # Pick column for terms
    column_title = 'Please choose which column contains the words.'
    columns = data.columns
    column, column_title = pick(columns, column_title)

    # For progress bar
    tqdm.pandas(desc="Tokenizing and POS-tagging...")

    data['tokens'] = data[column].progress_apply(lambda text: nlp(text))
    data['lemmas'] = data['tokens'].apply(get_lemma)
    data['lemmas_string'] = data['lemmas'].apply(lambda x: " ".join(x))
    data['without_stop'] = data['lemmas'].apply(remove_stop)
    data['filtered'] = data['tokens'].apply(lambda x: filter_pos(x, wanted_pos))
    data['filtered'] = data['filtered'].apply(remove_punc)
    data['filtered'] = data['filtered'].apply(lambda x: ", ".join(x))
    data.drop(['tokens', 'lemmas', 'lemmas_string', 'without_stop'], axis=1, inplace=True)

    return data
def tokenize(args):
    stanfordnlp_models_dir, lang = args.snlp_models_dir, args.lang
    raw_data_dir = args.raw_path
    tokenized_stories_dir = args.save_path

    if not os.path.isdir(stanfordnlp_models_dir):
        stanfordnlp.download(lang, resource_dir=stanfordnlp_models_dir,
                             confirm_if_exists=True)  # default "en"
    snlp = stanfordnlp.Pipeline(processors="tokenize", lang=lang,
                                models_dir=stanfordnlp_models_dir)

    valid_file_count = 0
    with tqdm(os.listdir(raw_data_dir)) as pbar:
        for fn in pbar:
            # check the file extension type
            if fn.split(".")[-1] != "story":
                continue
            else:
                valid_file_count += 1
            fp = os.path.join(raw_data_dir, fn)
            with open(fp, "r") as f:
                data = f.read()
            doc = snlp(data)

            # preprocess sentences
            sentences = []
            for i, sent in enumerate(doc.sentences):
                sent_ = {}
                sent_["index"] = i
                sent_["tokens"] = list(
                    map(
                        lambda token: {
                            "index": int(token.index),
                            "word": token.text,
                            "originalText": token.text
                        }, sent.tokens))
                sentences.append(sent_)

            output = {}
            output["docId"] = fn.split('.')[0]
            output["sentences"] = sentences

            # save the tokenized output json file
            output_fp = os.path.join(tokenized_stories_dir,
                                     "{}.json".format(fn.split(".")[0]))
            with open(output_fp, "w") as f:
                json.dump(output, f)

    print("Finished tokenizing {} files in {} to {}.".format(
        valid_file_count, raw_data_dir, tokenized_stories_dir))
def test_english():
    stanfordnlp.download('en')
    nlp = stanfordnlp.Pipeline()
    doc = nlp("I like small owls.")
    # shows
    # ('I', '2', 'nsubj')
    # ('like', '0', 'root')
    # ('small', '4', 'amod')
    # ('owls', '2', 'obj')
    # ('.', '2', 'punct')
    doc.sentences[0].print_dependencies()
def _stanfordnlp_download(language_package, resource_dir):
    from os.path import isdir
    from os import listdir
    found = False
    if isdir(resource_dir):
        files = listdir(resource_dir)
        filename_start = ''.join([language_package, "_"])
        for file in files:
            if file.startswith(filename_start):
                found = True
                break
    if not found:
        stanfordnlp.download(language_package, resource_dir=resource_dir,
                             confirm_if_exists=True, force=True)
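# Hypothetical call to the helper above. The 'en_ewt' package name and the resource directory
# are illustrative assumptions, not taken from the source.
_stanfordnlp_download('en_ewt', './stanfordnlp_resources')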
def set_parser(self, language):
    language_model = {
        "en": "en_ewt",
        "it": "it_isdt",
        "de": "de_gsd",
        "tr": "tr_imst",
        "hr": "hr_set"
    }
    if not os.path.exists(
            os.path.join(os.path.expanduser("~"),
                         "stanfordnlp_resources/{}_models".format(language_model[language]))):
        stanfordnlp.download(language, confirm_if_exists=True)
    self.nlp = stanfordnlp.Pipeline(lang=language)
def start_pipeline():
    mfile = os.getenv("HOME") + '/stanfordnlp_resources/en_ewt_models'
    if not os.path.exists(mfile):
        stanfordnlp.download('en', confirm_if_exists=True, force=True)
    sout = sys.stdout
    serr = sys.stderr
    f = open(os.devnull, 'w')
    sys.stdout = f
    sys.stderr = f  # turn output off - too noisy
    nlp = stanfordnlp.Pipeline()
    sys.stdout = sout
    sys.stderr = serr  # turn output on again
    return nlp
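# A short, hypothetical use of start_pipeline() above. The sample sentence is an illustrative
# assumption; print_dependencies() is the same accessor used in the other snippets here.
nlp = start_pipeline()
doc = nlp("Stanford NLP builds a dependency parse for each sentence.")
doc.sentences[0].print_dependencies()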
def test_japanese():
    stanfordnlp.download("ja")
    nlp = stanfordnlp.Pipeline(lang="ja")
    doc = nlp("にわにはにわにわとりがいます")
    # shows
    # ('に', '7', 'advmod')
    # ('わには', '7', 'advmod')
    # ('に', '2', 'case')
    # ('わに', '5', 'compound')
    # ('わとり', '7', 'nsubj')
    # ('が', '5', 'case')
    # ('い', '0', 'root')
    # ('ます', '7', 'aux')
    doc.sentences[0].print_dependencies()
def test_japanese2():
    stanfordnlp.download("ja")
    nlp = stanfordnlp.Pipeline(lang="ja")
    doc = nlp("庭には二羽鶏がいます。")
    # shows
    # ('庭', '7', 'iobj')
    # ('に', '1', 'case')
    # ('は', '1', 'case')
    # ('二', '5', 'nummod')
    # ('羽鶏', '7', 'nsubj')
    # ('が', '5', 'case')
    # ('い', '0', 'root')
    # ('ます', '7', 'aux')
    # ('。', '7', 'punct')
    doc.sentences[0].print_dependencies()
def download_model(self):
    if self.lib.lower() == "stanford":
        print("-----------You are going to use Stanford library-----------")
        if self.lang.lower() == "basque":
            print("-------------You are going to use Basque model-------------")
            # MODELS_DIR = '/home/edercarbajo/eu'
            MODELS_DIR = r'J:\TextSimilarity\eu'
            stanfordnlp.download('eu', MODELS_DIR)  # Download the Basque models
            # config = {'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
            #           'lang': 'eu',  # Language code for the language to build the Pipeline in
            #           'tokenize_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_tokenizer.pt',
            #           # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
            #           'pos_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_tagger.pt',
            #           'pos_pretrain_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt.pretrain.pt',
            #           'lemma_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_lemmatizer.pt',
            #           'depparse_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_parser.pt',
            #           'depparse_pretrain_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt.pretrain.pt'
            #           }
            config = {
                'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
                'lang': 'eu',  # Language code for the language to build the Pipeline in
                'tokenize_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tokenizer.pt',
                # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
                'pos_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tagger.pt',
                'pos_pretrain_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt',
                'lemma_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_lemmatizer.pt',
                'depparse_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_parser.pt',
                'depparse_pretrain_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt'
            }
            self.parser = stanfordnlp.Pipeline(**config)
        else:
            print("............Working...........")
    elif self.lib.lower() == "cube":
        print("-----------You are going to use Cube Library-----------")
        if self.lang.lower() == "basque":
            cube = Cube(verbose=True)
            cube.load("eu", "latest")
            self.parser = cube
        else:
            print("............Working...........")
    else:
        print("You cannot use this library. Introduce a valid library (Cube or Stanford)")
def __init__(self, model='en'):
    """
    Args:
        model (str): a spec for the stanfordnlp model (default: en). Please refer to the
            official website of stanfordnlp for a complete list of the available models.
            This option is useful if you are dealing with languages other than English.
    """
    self.model = model
    try:
        MODELS_DIR = '~/stanfordnlp_resources'
        import stanfordnlp
        stanfordnlp.download(model, treebank="en_ewt", resource_dir=MODELS_DIR,
                             confirm_if_exists=True)
    except ImportError as e:
        raise ImportError(
            'StanfordNLP backend requires the stanfordnlp library. '
            'Install it via pip install stanfordnlp.'
        ) from e
    try:
        config = {
            'processors': 'tokenize,pos,lemma,depparse',
            'tokenize_pretokenized': True,
            'models_dir': f'{MODELS_DIR}',
            'treebank': 'en_ewt',
            'pos_model_path': f'{MODELS_DIR}/en_ewt_models/en_ewt_tagger.pt',
            'pos_pretrain_path': f'{MODELS_DIR}/en_ewt_models/en_ewt.pretrain.pt',
            'pos_batch_size': 1000
        }
        self.nlp = stanfordnlp.Pipeline(**config)
    except OSError as e:
        raise ImportError(
            'Unable to load the English model. '
            'Run `stanfordnlp.download(model, MODELS_DIR)` first.'
        ) from e
def dl_missing_langs_snlp(langs, stanfordnlp_path):
    """
    downloads any missing languages from Stanford NLP resources

    Examples:
    >>> dl_missing_langs_snlp(langs="da", stanfordnlp_path=os.getcwd() + "/stanfordnlp_resources")
    """
    import stanfordnlp

    if isinstance(langs, str):
        langs = [langs]

    if not os.path.exists(stanfordnlp_path):
        os.makedirs(stanfordnlp_path)

    dl_langs = [folder[:2] for folder in os.listdir(stanfordnlp_path)]
    for lang in langs:
        if lang not in dl_langs:
            stanfordnlp.download(lang, resource_dir=stanfordnlp_path, force=True)
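# Hypothetical call to dl_missing_langs_snlp() above with more than one language; the language
# codes and target directory are illustrative assumptions.
import os
dl_missing_langs_snlp(langs=["en", "da"],
                      stanfordnlp_path=os.path.join(os.getcwd(), "stanfordnlp_resources"))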
def download_model(self):
    if self.lib.lower() == "stanford":
        print("-----------You are going to use Stanford library-----------")
        if self.lang.lower() == "basque":
            print("-------------You are going to use Basque model-------------")
            # MODELS_DIR = '/home/kepa/eu'
            MODELS_DIR = r'J:\TextSimilarity\eu'
            stanfordnlp.download('eu', MODELS_DIR)  # Download the Basque models
        elif self.lang.lower() == "english":
            print("-------------You are going to use English model-------------")
            MODELS_DIR = '/home/kepa/en'
            print("-------------Downloading Stanford English model-------------")
            stanfordnlp.download('en', MODELS_DIR)  # Download the English models
        elif self.lang.lower() == "spanish":
            print("-------------You are going to use Spanish model-------------")
            MODELS_DIR = '/home/kepa/es'
            stanfordnlp.download('es', MODELS_DIR)  # Download the Spanish models
        else:
            print("........You cannot use this language...........")
    elif self.lib.lower() == "cube":
        print("-----------You are going to use Cube Library-----------")
        if self.lang.lower() == "basque":
            cube = Cube(verbose=True)
            cube.load("eu", "latest")
        elif self.lang.lower() == "english":
            cube = Cube(verbose=True)
            cube.load("en", "latest")
        elif self.lang.lower() == "spanish":
            cube = Cube(verbose=True)
            cube.load("es", "latest")
        else:
            print("........You cannot use this language...........")
    else:
        print("You cannot use this library. Introduce a valid library (Cube or Stanford)")
def set_up(self):
    stanfordnlp.download(self.lang, self.MODELS_DIR)
import stanfordnlp

stanfordnlp.download('en')
parser.add_argument('-l', '--lang', help='Demo language', default="en")
parser.add_argument('-c', '--cpu', action='store_true', help='Use cpu as the device.')
args = parser.parse_args()

example_sentences = {
    "en": "Barack Obama was born in Hawaii. He was elected president in 2008.",
    "zh": "達沃斯世界經濟論壇是每年全球政商界領袖聚在一起的年度盛事。",
    "fr": "Van Gogh grandit au sein d'une famille de l'ancienne bourgeoisie. Il tente d'abord de faire carrière comme marchand d'art chez Goupil & C.",
    "vi": "Trận Trân Châu Cảng (hay Chiến dịch Hawaii theo cách gọi của Bộ Tổng tư lệnh Đế quốc Nhật Bản) là một đòn tấn công quân sự bất ngờ được Hải quân Nhật Bản thực hiện nhằm vào căn cứ hải quân của Hoa Kỳ tại Trân Châu Cảng thuộc tiểu bang Hawaii vào sáng Chủ Nhật, ngày 7 tháng 12 năm 1941, dẫn đến việc Hoa Kỳ sau đó quyết định tham gia vào hoạt động quân sự trong Chiến tranh thế giới thứ hai."
}

if args.lang not in example_sentences:
    print(f'Sorry, but we don\'t have a demo sentence for "{args.lang}" for the moment. Try one of these languages: {list(example_sentences.keys())}')
    exit()

# download the models
stanfordnlp.download(args.lang, args.models_dir, confirm_if_exists=True)

# set up a pipeline
print('---')
print('Building pipeline...')
pipeline = stanfordnlp.Pipeline(models_dir=args.models_dir, lang=args.lang,
                                use_gpu=(not args.cpu))

# process the document
doc = pipeline(example_sentences[args.lang])

# access nlp annotations
print('')
print('Input: {}'.format(example_sentences[args.lang]))
print("The tokenizer split the input into {} sentences.".format(len(doc.sentences)))
print('---')
print('tokens of first sentence: ')
doc.sentences[0].print_tokens()
print('')
print('---')
                    default='en')
parser.add_argument('-p', '--processors',
                    help='list of processors to run | default: "tokenize,mwt,pos,lemma,depparse"',
                    default='tokenize,mwt,pos,lemma,depparse')
parser.add_argument('text_file')
args = parser.parse_args()

# set output file path
output_file_path = args.text_file + '.out'

# map language code to treebank shorthand
treebank_shorthand = default_treebanks[args.language]

# check for models
print('checking for models...')
lang_models_dir = '%s/%s_models' % (args.models_dir, treebank_shorthand)
if not os.path.exists(lang_models_dir):
    print('could not find: ' + lang_models_dir)
    download(args.language, resource_dir=args.models_dir)

# set up pipeline
pipeline = Pipeline(processors=args.processors, lang=args.language, models_dir=args.models_dir)

# build document
print('running pipeline...')
doc = pipeline(open(args.text_file).read())

# write conll to file
doc.write_conll_to_file(output_file_path)
print('done.')
print('results written to: ' + output_file_path)
        parser.add_argument('--' + processor_setting, action='store_true', default=None,
                            help=argparse.SUPPRESS)
    else:
        parser.add_argument('--' + processor_setting, help=argparse.SUPPRESS)

parser.add_argument('text_file')
args = parser.parse_args()

# set output file path
if args.output is None:
    output_file_path = args.text_file + '.out'
else:
    output_file_path = args.output

# map language code to treebank shorthand
treebank_shorthand = default_treebanks[args.language]

# check for models
print('checking for models...')
lang_models_dir = '%s/%s_models' % (args.models_dir, treebank_shorthand)
if not os.path.exists(lang_models_dir):
    print('could not find: ' + lang_models_dir)
    download(args.language, resource_dir=args.models_dir, force=args.force_download)

# set up pipeline
pipeline_config = \
    dict([(k, v) for k, v in vars(args).items()
          if k in PROCESSOR_SETTINGS_LIST and v is not None])
pipeline = Pipeline(processors=args.processors, lang=args.language,
                    models_dir=args.models_dir, **pipeline_config)

# build document
print('running pipeline...')
doc = pipeline(open(args.text_file).read())

# write conll to file
doc.write_conll_to_file(output_file_path)
print('done.')
print('results written to: ' + output_file_path)
        parsed_text['lemma'].append(wrd.lemma)
    return parsed_text


# read the text and convert it to lower case
texto = str(input("Entre com um texto: ")).lower()
texto = texto.replace('\n', ' ').replace('\t', ' ').replace(',', ' ').replace('.', ' ').split(' ')
contador = Counter(texto)
print("Contando ", contador.items())

stanfordnlp.download('pt')
nlp = stanfordnlp.Pipeline(lang='pt')  # use the Portuguese models downloaded above

reducida = ""
for i in contador:
    reducida += " " + i

doc = nlp(reducida)
doc = extrac_lemma(doc)
print(doc)

nltk.download('stopwords')
stopwords = set(stopwords.words('portuguese'))
t = doc['lemma']
import stanfordnlp
from graphviz import Digraph

stanfordnlp.download('ja')
nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos,lemma,depparse', lang='ja')  # all five processors are the default
doc = nlp('一年ほど前、私は何人かと共にインドに向けて文学作品を放送する事業に携わっていた。種々のものをとりあげた中で、かなりの部分が現代ないしそれに近い時代の英国作家の韻文だった――例えばエリオット、ハーバート・リード、オーデン、スペンダー、ディラン・トーマス、ヘンリー・トリース、アレックス・コンフォート、ロバート・ブリッジズ、エドムンド・ブルンデン、D・H・ローレンス。詩の実作者に参加してもらえる場合はいつでもそうしていた。何故にこういう特殊な番組(ラジオ戦争における遠方からのささやかな側面攻撃だ)が始められることになったかは改めて説明するまでもないが、インド人聴衆に向けた放送である、という事実によって、我々の技法がある程度まで規定されていたという点には触れる必要があるだろう。要点はこうだ。我々の文芸番組はインド大学の学生たちをターゲットにしていた。彼らは少数かつ敵対的な聴衆で、英国のプロパガンダと表現しうるものは一つとして届かなかった。あらかじめ、聴取者は多めに見積もっても数千人を越すことはないだろうということがわかっていた。これが通常オンエアできる範囲を超えて「ハイブロウ」な番組を作るための口実になったのだ。')


def dependency_visualized(doc):
    i = 0  # used to name the output file, one dependency graph per sentence
    for sent in doc.sentences:
        dot = Digraph(format='png', filename=f'test/graphs{i}')
        dot.attr('node', shape='square', style='filled', fontname="IPAGothic")
        for wrd in sent.dependencies:
            if wrd[0].text != 'ROOT':
                # include the index because the same surface form (like 'で' or '。') can occur
                # several times with different indexes
                dot.edge(f'{wrd[0].text} {wrd[0].index}', f'{wrd[2].text} {wrd[2].index}',
                         fontname="IPAGothic")
            else:
                pass
        dot.render()
        i += 1


dependency_visualized(doc)
import stanfordnlp
from anytree import Node, RenderTree, NodeMixin

stanfordnlp.download('ru', 'stanfordnlp_resources')
nlp = stanfordnlp.Pipeline(lang='ru', models_dir='stanfordnlp_resources')

features = [
    'index', 'text', 'lemma', 'upos', 'xpos', 'feats', 'governor', 'dependency_relation'
]


def stanford_print_parse(sentence):
    # Parses the sentence and outputs a CoNLL-style parse
    doc = nlp(sentence)
    return "\n".join([
        "\t".join([
            "{}".format(getattr(w, k)) for k in features if getattr(w, k) is not None
        ]) for w in doc.sentences[0].words
    ])


class Token(object):
    pass


class TokenNode(Token, NodeMixin):
    def __init__(self, name, text,
"達沃斯世界經濟論壇是每年全球政商界領袖聚在一起的年度盛事。", "fr": "Van Gogh grandit au sein d'une famille de l'ancienne bourgeoisie. Il tente d'abord de faire carrière comme marchand d'art chez Goupil & C.", "vi": "Trận Trân Châu Cảng (hay Chiến dịch Hawaii theo cách gọi của Bộ Tổng tư lệnh Đế quốc Nhật Bản) là một đòn tấn công quân sự bất ngờ được Hải quân Nhật Bản thực hiện nhằm vào căn cứ hải quân của Hoa Kỳ tại Trân Châu Cảng thuộc tiểu bang Hawaii vào sáng Chủ Nhật, ngày 7 tháng 12 năm 1941, dẫn đến việc Hoa Kỳ sau đó quyết định tham gia vào hoạt động quân sự trong Chiến tranh thế giới thứ hai." } if args.lang not in example_sentences: print( f'Sorry, but we don\'t have a demo sentence for "{args.lang}" for the moment. Try one of these languages: {list(example_sentences.keys())}' ) exit() # download the models stanfordnlp.download(args.lang, args.models_dir, force_download=False, confirm_if_exists=False) # set up a pipeline print('---') print('Building pipeline...') pipeline = stanfordnlp.Pipeline(models_dir=args.models_dir, lang=args.lang, use_gpu=(not args.cpu)) # process the document doc = pipeline(example_sentences[args.lang]) # access nlp annotations print('') print('Input: {}'.format(example_sentences[args.lang])) print("The tokenizer split the input into {} sentences.".format( len(doc.sentences))) print('---')
import stanfordnlp
import time

root = '/root'
lang = 'en'
text = "You, you love it how I move you\nYou love it how I touch you\nMy one, when all is said and done\nYou'll believe God is a woman\n\nAnd I, I feel it after midnight\nA feeling that you can't fight\nMy one, it lingers when we're done\nYou'll believe God is a woman\n\nI don't wanna waste no time, yuh\nYou ain't got a one-track mind, yuh\nHave it any way you like, yuh\nAnd I can tell that you know\nI know how I want it\n\nAin't nobody else can relate\nBoy I like that you ain't afraid\nBaby lay me down and let's pray\nI'm telling you the way I like it\nHow I want it\n\nYuh\nAnd I can be all of things you tell me not to be, yuh\nWhen you try to come for me I keep on flourishing, yuh\nAnd he see the universe when I'm in company, uh\nIt's all in me\n\nYou, you love it how I move you\nYou love it how I touch you\nMy one, when all is said and done\nYou'll believe God is a woman\n\nAnd I, I feel it after midnight\nA feeling that you can't fight\nMy one, it lingers when we're done\nYou'll believe God is a woman\n\nI tell you all the things you should know\nSo baby take my hands, save your soul\nWe can make it last, take it slow\nAnd I can tell that you know\nI know how I want it\n\nBut you different from the rest\nAnd boy if you confess you might get blessed\nSee if you deserve what comes next\nI'm telling you the way I like it\nHow I want it\n\nYuh\nAnd I can be all of things you tell me not to be, yuh\nWhen you try to come for me I keep on flourishing, yuh\nAnd he see the universe when I'm in company\nIt's all in me\n\nYou, you love it how I move you\nYou love it how I touch you\nMy one, when all is said and done\nYou'll believe God is a woman\n\nAnd I, I feel it after midnight\nA feeling that you can't fight\nMy one, it lingers when we're done\nYou'll believe God is a woman, yeah yeah\n\nGod is a woman, yeah yeah\nGod is a woman\nMy one (one)\nWhen all is said and done\nYou'll believe God is a woman\n\n(You'll believe God)\nGod is a woman (oh, yeah)\nGod is a woman, yeah\n(One) It lingers when we're done\nYou'll believe God is a woman"
processors = "tokenize,mwt,pos,lemma"  # tokenize,mwt,pos,lemma,depparse

stanfordnlp.download(lang, resource_dir=root, should_download=True, confirm_if_exists=False)

start_time = time.time()
pipeline = stanfordnlp.Pipeline(
    lang=lang, models_dir=root, use_gpu=False,
    processors=processors)  # This sets up a default neural pipeline in English
elapsed_time = time.time() - start_time
print("loaded in:%f" % elapsed_time)

start_time = time.time()
doc = pipeline(text)
elapsed_time = time.time() - start_time
print("parsed in:%f" % elapsed_time)

for sentence in doc.sentences:
    sentence.print_dependencies()
    # sentence.print_tokens()
    for token in sentence.tokens:
        word = token.words[0]
        print("Index:%s word:%s lemma:%s" %
import stanfordnlp
import torch
from transformers import *

stanfordnlp.download('en')  # download the English models that the neural pipeline will use
nlp = stanfordnlp.Pipeline()  # set up a default neural pipeline for English
# Now nlp is a callable that receives a string as input and returns an nlp object

# create tokenizer
pretrained_weights = 'scibert-scivocab-uncased'
tokenizer_class = BertTokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

inp_string = "The data for our first experiment is a corpus of parsed sentences from the Penn Treebank"
inp_tokens = tokenizer.encode(inp_string)
sequence = nlp(inp_string)
sequence.sentences[0].print_dependencies()

# print(f'Sentence length is: {len(sequence)}')
# print(f'Tokenized length is: {len(inp_tokens)}')
        g.add_node(noun, **node_attributes)

    # Add edges
    combos = list(itertools.combinations(nouns, r=2))
    for i, combo in enumerate(combos):
        combo = tuple(sorted(list(combo)))
        if combo in WEIGHTS.keys():
            WEIGHTS[combo] += 1
            attr = {'weight': WEIGHTS[combo]}
            g.change_edge(id=f'{combo[0]}-{combo[1]}', **attr)
        else:
            WEIGHTS[combo] = 1
            attr = {'directed': False, 'weight': WEIGHTS[combo]}
            g.add_edge(id=f'{combo[0]}-{combo[1]}', source=combo[0], target=combo[1], **attr)


# Download model for nlp
stanford_path = Path.home() / 'stanfordnlp_resources' / 'da_ddt_models'
if not stanford_path.exists():
    stanfordnlp.download('da')

# Set up Gephi client
g = pygephi.GephiClient('http://localhost:8080/workspace1', autoflush=True)
g.clean()

# Keep track of edgelist (for weights)
WEIGHTS = dict()
import pandas as pd
from igannotator.annotator import StanfordAnnotator
import stanfordnlp
import os
import re

RESOURCES_DIR = "resources"
stanfordnlp.download("pl", resource_dir=RESOURCES_DIR, confirm_if_exists=False, force=True)
annotator = StanfordAnnotator(RESOURCES_DIR)

directory = "./data/conllu/goldStandard-stanford"
if not os.path.exists(directory):
    os.makedirs(directory)

with open("data/nauka_1.txt", "r+", encoding="utf8") as input_file:
    content = input_file.read()
    lines = [line for line in content.split("\n\n") if not line.startswith("--")]
    for line in lines:
        line_regex = re.compile("^([0-9]*)\\. ((?s).*)$")
        regex_result = line_regex.search(line)
        number = regex_result.group(1)
        text = regex_result.group(2)
        print(text)
        try:
            dfs = annotator.annotate(text)
            output_df = pd.DataFrame()
def download_stanfordnlp():
    import stanfordnlp
    stanfordnlp.download('en')
parser.add_argument(
    '--output_path',
    type=str,
    default='tsv_features',
    help='Path to the folder that contains the final tsv feature files used for classification')
parser.add_argument('--test', action='store_true',
                    help='Parse test set with no target class')
args = parser.parse_args()

if not os.path.exists(args.output_path):
    os.makedirs(args.output_path)

stanfordnlp.download('en')  # This downloads the English models for the neural pipeline

if args.test:
    parse_test_transcripts(args.input_path,
                           os.path.join(args.output_path, 'text_features_test.txt'))
    build_test_csv(os.path.join(args.output_path, 'text_features_test.txt'),
                   os.path.join(args.output_path, 'text_features_test.tsv'))
    os.remove(os.path.join(args.output_path, 'text_features_test.txt'))
else:
    parse_transcripts(args.input_path,
                      os.path.join(args.output_path, 'text_features_train.txt'))
    build_csv(os.path.join(args.output_path, 'text_features_train.txt'),
              os.path.join(args.output_path, 'text_features_train.tsv'))