Example #1
def preprocess(models_dir, processors, extractive, cnn_dm_file, article_file,
               summary_file, output_train_files, output_test_files,
               split_ratio):
    """
    Runs the entire preprocessing pipeline, producing files ready to be used as the input of
    the graph neural network.
    :param models_dir: The path to the stanfordnlp directory.
    :param processors: List of parsers to use. Options: tokenize, mwt, pos, lemma, depparse.
    :param extractive: Whether to make an extractive summary or not. If True, the input file should contain the best ids.
    :param cnn_dm_file: This file should contain the articles and the summaries in a jsonl format.
    :param article_file: This file will contain every article graph.
    :param summary_file: This file will contain every summary graph.
    :param output_train_files: The paths to save the training files.
                               The first parameter is the training input, the second is the expected output.
    :param output_test_files: The paths to save the validation files.
                              The first parameter is the validation input, the second is the expected output.
    :param split_ratio: The ratio of data used for training vs validation.
    :return: None
    """
    from graph_transformations.preprocessor import main as stanford_preprocess
    if extractive:
        from graph_transformations.cnn_extractive_parser import main as cnn_process
    else:
        from graph_transformations.cnn_parser import main as cnn_process
    from graph_transformations.train_test_split import train_test_split
    import stanfordnlp

    if not os.path.exists(models_dir):
        stanfordnlp.download('en', resource_dir=models_dir)
    correct_processors = ["tokenize", "mwt", "pos", "lemma", "depparse"]

    incorrect = [i for i in processors if i not in correct_processors]
    if len(incorrect) != 0:
        raise ValueError(
            "The following processor values are incorrect: {}".format(
                incorrect))

    if not os.path.exists(cnn_dm_file):
        raise FileNotFoundError(
            "The input file is not found. {} not found".format(cnn_dm_file))

    pipeline = stanfordnlp.Pipeline(models_dir=models_dir,
                                    processors=processors)
    processed_file = "{}_processed.jsonl".format(
        os.path.splitext(cnn_dm_file)[0])
    stanford_preprocess(pipeline, cnn_dm_file, processed_file)

    dependency_file = "dep.jsonl"
    word_file = "words.jsonl"
    pos_file = "pos.jsonl"
    cnn_process(processed_file, article_file, summary_file, dependency_file,
                word_file, pos_file, dependency_file[:-1], word_file[:-1],
                pos_file[:-1])

    ratio = int(split_ratio) if split_ratio >= 1.0 else int(split_ratio * 100)
    train_test_split(article_file, output_train_files[0], output_test_files[0],
                     ratio)
    train_test_split(summary_file, output_train_files[1], output_test_files[1],
                     ratio)
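A minimal usage sketch for preprocess() above. All file paths, the tuple contents, and the split ratio are hypothetical placeholders chosen to match the docstring, not values from the original project.

# Hypothetical call; every path below is a placeholder.
preprocess(
    models_dir="stanfordnlp_resources",
    processors=["tokenize", "mwt", "pos", "lemma", "depparse"],  # list form, as the docstring specifies
    extractive=True,                       # cnn_dm_file is then expected to contain the best ids
    cnn_dm_file="cnn_dm.jsonl",
    article_file="articles.jsonl",
    summary_file="summaries.jsonl",
    output_train_files=("train_input.jsonl", "train_target.jsonl"),
    output_test_files=("val_input.jsonl", "val_target.jsonl"),
    split_ratio=0.9,                       # values below 1.0 are scaled to a percentage (here 90)
)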
Example #2
 def __init__(self, lang='en', config='default'):
     super().__init__()
     # Downloads the language models for the neural pipeline if never installed before
     if 'stanfordnlp_resources' not in listdir('./'):
         stanfordnlp.download(lang)  # download the models for the requested language
     # Initialize pipeline
     self.nlp = stanfordnlp.Pipeline(**config_stanford_nlp[config])
     self.name_save = 'stanfordnlp'
Example #3
def main():
    print(gen_info(1, "downloading resources to analyse Japanese text"))
    print(gen_info(1, "please answer Y to install these resources"))
    stanfordnlp.download('ja')
    print(gen_info(1, "finish downloading"))
    print(gen_info(1, "testing dependency modules"))
    test_stanfordnlp(SAMPLE_TEXT)
    print(gen_info(1, "finish testing"))
Example #4
 def __init__(self, nlp=None, lang="tr"):
     if nlp:
         self.nlp = nlp
     else:
         try:
             self.nlp = stanfordnlp.Pipeline(lang=lang)
         except Exception:  # models not available locally; download and rebuild the pipeline
             stanfordnlp.download(lang)
             self.nlp = stanfordnlp.Pipeline(lang=lang)
Example #5
 def __init__(self):
   if not os.path.isdir(COM.STANFORD_NLP_RESOURCES_PATH):
     stanfordnlp.download('en', force=True, resource_dir=COM.STANFORD_NLP_RESOURCES_PATH)
   
   self.nlp = stanfordnlp.Pipeline(models_dir=COM.STANFORD_NLP_RESOURCES_PATH)
   
   glove_100000_word_list_df = pd.read_csv(COM.CSV_WORD_LIST_GLOVE_100000, header=None)
   self.__stemmer = SnowballStemmer("english")
   self.__list_glove_words = glove_100000_word_list_df[0].tolist()
Example #6
def test():
    stanfordnlp.download('en')

    sentence = "The argument is used to specify the task All five processors are taken by default if no argument is passed Here is a quick overview of the processors"
    # weights = load_model()

    nlp = stanfordnlp.Pipeline(processors="tokenize,depparse")
    f = nlp(sentence)
    m = 0
Example #7
def main(data):
    nltk.download('stopwords')

    # Pick which PoS tags you want
    postag_title = 'Please pick POS tags (SPACE to mark, ENTER to continue)'
    postags = [
        'ADJ', 'ADP', 'PUNCT', 'ADV', 'AUX', 'SYM', 'INTJ', 'CCONJ', 'X',
        'NOUN', 'DET', 'PROPN', 'NUM', 'VERB', 'PART', 'PRON', 'SCONJ'
    ]
    wanted_pos = pick(postags,
                      postag_title,
                      multi_select=True,
                      min_selection_count=1)
    wanted_pos = [pos[0] for pos in wanted_pos]

    # Pick language
    lang_title = 'Please choose which language the text is in.'
    langs = ['en', 'da', 'other']
    lang, lang_title = pick(langs, lang_title)
    if lang == 'other':
        lang = input('Please input language code \
	(see stanfordnlp.github.io/stanfordnlp/models.html)')

    # Download model for nlp.
    if not os.path.exists(
            os.path.join(os.environ['HOME'], 'stanfordnlp_resources',
                         f'{lang}_ddt_models')):
        stanfordnlp.download(lang)

    # Set up nlp pipeline
    nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,lemma,pos', lang=lang)

    # pick column for terms
    column_title = 'Please choose which column contains the words.'
    columns = data.columns
    column, column_title = pick(columns, column_title)

    # For progress bar
    tqdm.pandas(desc="Tokenizing and POS-tagging...")

    data['tokens'] = data[column].progress_apply(lambda text: nlp(text))
    data['lemmas'] = data['tokens'].apply(get_lemma)
    data['lemmas_string'] = data['lemmas'].apply(lambda x: " ".join(x))
    data['without_stop'] = data['lemmas'].apply(remove_stop)
    data['filtered'] = data['tokens'].apply(
        lambda x: filter_pos(x, wanted_pos))
    data['filtered'] = data['filtered'].apply(remove_punc)
    data['filtered'] = data['filtered'].apply(lambda x: ", ".join(x))
    data.drop(['tokens', 'lemmas', 'lemmas_string', 'without_stop'],
              axis=1,
              inplace=True)

    return data
Example #8
def tokenize(args):
    stanfordnlp_models_dir, lang = args.snlp_models_dir, args.lang
    raw_data_dir = args.raw_path
    tokenized_stories_dir = args.save_path

    if not os.path.isdir(stanfordnlp_models_dir):
        stanfordnlp.download(lang,
                             resource_dir=stanfordnlp_models_dir,
                             confirm_if_exists=True)  # default "en"

    snlp = stanfordnlp.Pipeline(processors="tokenize",
                                lang=lang,
                                models_dir=stanfordnlp_models_dir)

    valid_file_count = 0
    with tqdm(os.listdir(raw_data_dir)) as pbar:
        for fn in pbar:
            # check the file extension type
            if fn.split(".")[-1] != "story": continue
            else: valid_file_count += 1

            fp = os.path.join(raw_data_dir, fn)
            with open(fp, "r") as f:
                data = f.read()
                doc = snlp(data)

            # preprocess sentences
            sentences = []
            for i, sent in enumerate(doc.sentences):
                sent_ = {}
                sent_["index"] = i
                sent_["tokens"] = list(
                    map(
                        lambda token: {
                            "index": int(token.index),
                            "word": token.text,
                            "originalText": token.text
                        }, sent.tokens))
                sentences.append(sent_)

            output = {}
            output["docId"] = fn.split('.')[0]
            output["sentences"] = sentences

            # save the tokenized output json file
            output_fp = os.path.join(tokenized_stories_dir,
                                     "{}.json".format(fn.split(".")[0]))
            with open(output_fp, "w") as f:
                json.dump(output, f)

    print("Finish tokenizing {} files in {} to {}.".format(
        valid_file_count, raw_data_dir, tokenized_stories_dir))
Example #9
def test_english():

	stanfordnlp.download('en')
	nlp = stanfordnlp.Pipeline()
	doc = nlp("I like small owls.")

	# shows
	# ('I', '2', 'nsubj')
	# ('like', '0', 'root')
	# ('small', '4', 'amod')
	# ('owls', '2', 'obj')
	# ('.', '2', 'punct')
	doc.sentences[0].print_dependencies()
Example #10
 def _stanfordnlp_download(language_package, resource_dir):
     from os.path import isdir
     from os import listdir
     found = False
     if isdir(resource_dir):
         files = listdir(resource_dir)
         filename_start = ''.join([language_package, "_"])
         for file in files:
             if file.startswith(filename_start):
                 found = True
                 break
     if not found:
         stanfordnlp.download(language_package, resource_dir=resource_dir, confirm_if_exists=True, force=True)
Example #11
 def set_parser(self, language):
     language_model = {
         "en": "en_ewt",
         "it": "it_isdt",
         "de": "de_gsd",
         "tr": "tr_imst",
         "hr": "hr_set"
     }
     if not os.path.exists(
             os.path.join(
                 os.path.expanduser("~"),
                 "stanfordnlp_resources/{}_models".format(
                     language_model[language]))):
         stanfordnlp.download(language, confirm_if_exists=True)
     self.nlp = stanfordnlp.Pipeline(lang=language)
Example #12
 def start_pipeline():
     mfile = os.getenv("HOME") + \
         '/stanfordnlp_resources/en_ewt_models'
     if not os.path.exists(mfile):
         stanfordnlp.download('en', confirm_if_exists=True, force=True)
     sout = sys.stdout
     serr = sys.stderr
     f = open(os.devnull, 'w')
     sys.stdout = f
     sys.stderr = f
     # turn output off - too noisy
     nlp = stanfordnlp.Pipeline()
     sys.stdout = sout
     sys.stderr = serr
     # turn output on again
     return nlp
Example #13
def test_japanese():

	stanfordnlp.download("ja")
	nlp = stanfordnlp.Pipeline(lang="ja")
	doc = nlp("にわにはにわにわとりがいます")

	# shows
	# ('に', '7', 'advmod')
	# ('わには', '7', 'advmod')
	# ('に', '2', 'case')
	# ('わに', '5', 'compound')
	# ('わとり', '7', 'nsubj')
	# ('が', '5', 'case')
	# ('い', '0', 'root')
	# ('ます', '7', 'aux')
	doc.sentences[0].print_dependencies()
Example #14
def test_japanese2():

	stanfordnlp.download("ja")
	nlp = stanfordnlp.Pipeline(lang="ja")
	doc = nlp("庭には二羽鶏がいます。")

	# shows
	# ('庭', '7', 'iobj')
	# ('に', '1', 'case')
	# ('は', '1', 'case')
	# ('二', '5', 'nummod')
	# ('羽鶏', '7', 'nsubj')
	# ('が', '5', 'case')
	# ('い', '0', 'root')
	# ('ます', '7', 'aux')
	# ('。', '7', 'punct')
	doc.sentences[0].print_dependencies()
Example #15
 def download_model(self):
     if self.lib.lower() == "stanford":
         print("-----------You are going to use Stanford library-----------")
         if self.lang.lower() == "basque":
             print("-------------You are going to use Basque model-------------")
             # MODELS_DIR = '/home/edercarbajo/eu'
             MODELS_DIR = r'J:\TextSimilarity\eu'  # raw string keeps the Windows backslashes literal
             stanfordnlp.download('eu', MODELS_DIR)  # Download the Basque models
             # config = {'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
             #           'lang': 'eu',  # Language code for the language to build the Pipeline in
             #           'tokenize_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_tokenizer.pt',
             #           # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
             #           'pos_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_tagger.pt',
             #           'pos_pretrain_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt.pretrain.pt',
             #           'lemma_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_lemmatizer.pt',
             #           'depparse_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_parser.pt',
             #           'depparse_pretrain_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt.pretrain.pt'
             #           }
             config = {'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
                       'lang': 'eu',  # Language code for the language to build the Pipeline in
                       'tokenize_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tokenizer.pt',
                       # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
                       'pos_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tagger.pt',
                       'pos_pretrain_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt',
                       'lemma_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_lemmatizer.pt',
                       'depparse_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_parser.pt',
                       'depparse_pretrain_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt'
                       }
             self.parser = stanfordnlp.Pipeline(**config)
         else:
             print("............Working...........")
     elif self.lib.lower() == "cube":
         print("-----------You are going to use Cube Library-----------")
         if self.lang.lower() == "basque":
             cube = Cube(verbose=True)
             cube.load("eu", "latest")
             self.parser = cube
         else:
             print("............Working...........")
     else:
         print("You cannot use this library. Introduce a valid library (Cube or Stanford)")
Example #16
    def __init__(self, model='en'):
        """
        Args:
            model (str): a spec for the StanfordNLP model (default: en). Please refer to the
            official StanfordNLP website for a complete list of the available models.
            This option is useful if you are dealing with languages other than English.
        """

        self.model = model

        try:
            MODELS_DIR = '~/stanfordnlp_resources'
            import stanfordnlp
            stanfordnlp.download(model,
                                 treebank="en_ewt",
                                 resource_dir=MODELS_DIR,
                                 confirm_if_exists=True)
        except ImportError as e:
            raise ImportError(
                'StanfordNLP backend requires the stanfordnlp library. Install it via pip install stanfordnlp.'
            ) from e

        try:
            config = {
                'processors': 'tokenize,pos,lemma,depparse',
                'tokenize_pretokenized': True,
                'models_dir': f'{MODELS_DIR}',
                'treebank': 'en_ewt',
                'pos_model_path':
                f'{MODELS_DIR}/en_ewt_models/en_ewt_tagger.pt',
                'pos_pretrain_path':
                f'{MODELS_DIR}/en_ewt_models/en_ewt.pretrain.pt',
                'pos_batch_size': 1000
            }
            self.nlp = stanfordnlp.Pipeline(**config)
        except OSError as e:
            raise ImportError(
                'Unable to load the English model. Run `stanfordnlp.download(model, MODELS_DIR)` first.'
            ) from e
Example #17
def dl_missing_langs_snlp(langs, stanfordnlp_path):
    """
    Downloads any missing language models into the given StanfordNLP resources directory.

    Examples:
    >>> dl_missing_langs_snlp(langs = "da", stanfordnlp_path = os.getcwd() + "/stanfordnlp_resources")
    """
    import stanfordnlp

    if isinstance(langs, str):
        langs = [langs]

    if not os.path.exists(stanfordnlp_path):
        os.makedirs(stanfordnlp_path)

    dl_langs = [folder[:2] for folder in os.listdir(stanfordnlp_path)]
    for lang in langs:
        if lang not in dl_langs:
            stanfordnlp.download(lang,
                                 resource_dir=stanfordnlp_path,
                                 force=True)
Example #18
 def download_model(self):
     if self.lib.lower() == "stanford":
         print(
             "-----------You are going to use Stanford library-----------")
         if self.lang.lower() == "basque":
             print(
                 "-------------You are going to use Basque model-------------"
             )
             # MODELS_DIR = '/home/kepa/eu'
             MODELS_DIR = r'J:\TextSimilarity\eu'
             stanfordnlp.download('eu',
                                  MODELS_DIR)  # Download the Basque models
         elif self.lang.lower() == "english":
             print(
                 "-------------You are going to use English model-------------"
             )
             MODELS_DIR = '/home/kepa/en'
             print(
                 "-------------Downloading Stanford English model-------------"
             )
             stanfordnlp.download('en',
                                  MODELS_DIR)  # Download the English models
         elif self.lang.lower() == "spanish":
             print(
                 "-------------You are going to use Spanish model-------------"
             )
             MODELS_DIR = '/home/kepa/es'
             stanfordnlp.download('es',
                                  MODELS_DIR)  # Download the Spanish models
         else:
             print("........You cannot use this language...........")
     elif self.lib.lower() == "cube":
         print("-----------You are going to use Cube Library-----------")
         if self.lang.lower() == "basque":
             cube = Cube(verbose=True)
             cube.load("eu", "latest")
         elif self.lang.lower() == "english":
             cube = Cube(verbose=True)
             cube.load("en", "latest")
         elif self.lang.lower() == "spanish":
             cube = Cube(verbose=True)
             cube.load("es", "latest")
         else:
             print("........You cannot use this language...........")
     else:
         print(
             "You cannot use this library. Introduce a valid library (Cube or Stanford)"
         )
Example #19
 def set_up(self):
     stanfordnlp.download(self.lang, self.MODELS_DIR)
Example #20
import stanfordnlp

stanfordnlp.download('en')
Example #21
    parser.add_argument('-l', '--lang', help='Demo language',
                        default="en")
    parser.add_argument('-c', '--cpu', action='store_true', help='Use cpu as the device.')
    args = parser.parse_args()

    example_sentences = {"en": "Barack Obama was born in Hawaii.  He was elected president in 2008.",
            "zh": "達沃斯世界經濟論壇是每年全球政商界領袖聚在一起的年度盛事。",
            "fr": "Van Gogh grandit au sein d'une famille de l'ancienne bourgeoisie. Il tente d'abord de faire carrière comme marchand d'art chez Goupil & C.",
            "vi": "Trận Trân Châu Cảng (hay Chiến dịch Hawaii theo cách gọi của Bộ Tổng tư lệnh Đế quốc Nhật Bản) là một đòn tấn công quân sự bất ngờ được Hải quân Nhật Bản thực hiện nhằm vào căn cứ hải quân của Hoa Kỳ tại Trân Châu Cảng thuộc tiểu bang Hawaii vào sáng Chủ Nhật, ngày 7 tháng 12 năm 1941, dẫn đến việc Hoa Kỳ sau đó quyết định tham gia vào hoạt động quân sự trong Chiến tranh thế giới thứ hai."}

    if args.lang not in example_sentences:
        print(f'Sorry, but we don\'t have a demo sentence for "{args.lang}" for the moment. Try one of these languages: {list(example_sentences.keys())}')
        exit()

    # download the models
    stanfordnlp.download(args.lang, args.models_dir, confirm_if_exists=True)
    # set up a pipeline
    print('---')
    print('Building pipeline...')
    pipeline = stanfordnlp.Pipeline(models_dir=args.models_dir, lang=args.lang, use_gpu=(not args.cpu))
    # process the document
    doc = pipeline(example_sentences[args.lang])
    # access nlp annotations
    print('')
    print('Input: {}'.format(example_sentences[args.lang]))
    print("The tokenizer split the input into {} sentences.".format(len(doc.sentences)))
    print('---')
    print('tokens of first sentence: ')
    doc.sentences[0].print_tokens()
    print('')
    print('---')
Example #22
                        default='en')
    parser.add_argument(
        '-p',
        '--processors',
        help=
        'list of processors to run | default: "tokenize,mwt,pos,lemma,depparse"',
        default='tokenize,mwt,pos,lemma,depparse')
    parser.add_argument('text_file')
    args = parser.parse_args()
    # set output file path
    output_file_path = args.text_file + '.out'
    # map language code to treebank shorthand
    treebank_shorthand = default_treebanks[args.language]
    # check for models
    print('checking for models...')
    lang_models_dir = '%s/%s_models' % (args.models_dir, treebank_shorthand)
    if not os.path.exists(lang_models_dir):
        print('could not find: ' + lang_models_dir)
        download(args.language, resource_dir=args.models_dir)
    # set up pipeline
    pipeline = Pipeline(processors=args.processors,
                        lang=args.language,
                        models_dir=args.models_dir)
    # build document
    print('running pipeline...')
    doc = pipeline(open(args.text_file).read())
    # write conll to file
    doc.write_conll_to_file(output_file_path)
    print('done.')
    print('results written to: ' + output_file_path)
Example #23
0
            parser.add_argument('--' + processor_setting, action='store_true', default=None, help=argparse.SUPPRESS)
        else:
            parser.add_argument('--' + processor_setting, help=argparse.SUPPRESS)
    parser.add_argument('text_file')
    args = parser.parse_args()
    # set output file path
    if args.output is None:
        output_file_path = args.text_file+'.out'
    else:
        output_file_path = args.output
    # map language code to treebank shorthand
    treebank_shorthand = default_treebanks[args.language]
    # check for models
    print('checking for models...')
    lang_models_dir = '%s/%s_models' % (args.models_dir, treebank_shorthand)
    if not os.path.exists(lang_models_dir):
        print('could not find: '+lang_models_dir)
        download(args.language, resource_dir=args.models_dir, force=args.force_download)
    # set up pipeline
    pipeline_config = \
        dict([(k, v) for k, v in vars(args).items() if k in PROCESSOR_SETTINGS_LIST and v is not None])
    pipeline = Pipeline(processors=args.processors, lang=args.language, models_dir=args.models_dir, **pipeline_config)
    # build document
    print('running pipeline...')
    doc = pipeline(open(args.text_file).read())
    # write conll to file
    doc.write_conll_to_file(output_file_path)
    print('done.')
    print('results written to: '+output_file_path)

Example #24
            parsed_text['lemma'].append(wrd.lemma)
    return parsed_text


# read the text and convert it to lower case
texto = str(input("Entre com um texto: ")).lower()

texto = texto.replace('\n',
                      ' ').replace('\t',
                                   ' ').replace(',',
                                                ' ').replace('.',
                                                             ' ').split(' ')
contador = Counter(texto)
print("Contando ", contador.items())

stanfordnlp.download('pt')
nlp = stanfordnlp.Pipeline(lang='pt')  # Portuguese pipeline, matching the downloaded 'pt' models

reducida = ""
for i in contador:
    reducida += " " + i

doc = nlp(reducida)
doc = extrac_lemma(doc)

print(doc)

nltk.download('stopwords')

stopwords = set(stopwords.words('portuguese'))
t = doc['lemma']
Example #25
import stanfordnlp
from graphviz import Digraph


stanfordnlp.download('ja')
nlp = stanfordnlp.Pipeline(processors= 'tokenize,mwt,pos,lemma,depparse', lang='ja')
# ALL five processors are default.

doc = nlp('一年ほど前、私は何人かと共にインドに向けて文学作品を放送する事業に携わっていた。種々のものをとりあげた中で、かなりの部分が現代ないしそれに近い時代の英国作家の韻文だった――例えばエリオット、ハーバート・リード、オーデン、スペンダー、ディラン・トーマス、ヘンリー・トリース、アレックス・コンフォート、ロバート・ブリッジズ、エドムンド・ブルンデン、D・H・ローレンス。詩の実作者に参加してもらえる場合はいつでもそうしていた。何故にこういう特殊な番組(ラジオ戦争における遠方からのささやかな側面攻撃だ)が始められることになったかは改めて説明するまでもないが、インド人聴衆に向けた放送である、という事実によって、我々の技法がある程度まで規定されていたという点には触れる必要があるだろう。要点はこうだ。我々の文芸番組はインド大学の学生たちをターゲットにしていた。彼らは少数かつ敵対的な聴衆で、英国のプロパガンダと表現しうるものは一つとして届かなかった。あらかじめ、聴取者は多めに見積もっても数千人を越すことはないだろうということがわかっていた。これが通常オンエアできる範囲を超えて「ハイブロウ」な番組を作るための口実になったのだ。')

def dependency_visualized(doc):
  i = 0 # to name file, including dependency in a sentence
  for sent in doc.sentences:
    dot = Digraph(format='png', filename=f'test/graphs{i}')
    dot.attr('node', shape='square', style='filled', fontname="IPAGothic")
    for wrd in sent.dependencies:
      if wrd[0].text != 'ROOT':
        dot.edge(f'{wrd[0].text} {wrd[0].index}', f'{wrd[2].text} {wrd[2].index}', fontname="IPAGothic")
        # specify index because there is the same word(like 'で、。') but with different indexes
      else:
        pass
    dot.render()
    i += 1 

dependency_visualized(doc)
Example #26
import stanfordnlp
from anytree import Node, RenderTree, NodeMixin

stanfordnlp.download('ru', 'stanfordnlp_resources')
nlp = stanfordnlp.Pipeline(lang='ru', models_dir='stanfordnlp_resources')

features = [
    'index', 'text', 'lemma', 'upos', 'xpos', 'feats', 'governor',
    'dependency_relation'
]


def stanford_print_parse(sentence):
    # Parses the sentence and outputs CONLL parse
    doc = nlp(sentence)
    return "\n".join([
        "\t".join([
            "{}".format(getattr(w, k)) for k in features
            if getattr(w, k) is not None
        ]) for w in doc.sentences[0].words
    ])


class Token(object):
    pass


class TokenNode(Token, NodeMixin):
    def __init__(self,
                 name,
                 text,
Example #27
        "達沃斯世界經濟論壇是每年全球政商界領袖聚在一起的年度盛事。",
        "fr":
        "Van Gogh grandit au sein d'une famille de l'ancienne bourgeoisie. Il tente d'abord de faire carrière comme marchand d'art chez Goupil & C.",
        "vi":
        "Trận Trân Châu Cảng (hay Chiến dịch Hawaii theo cách gọi của Bộ Tổng tư lệnh Đế quốc Nhật Bản) là một đòn tấn công quân sự bất ngờ được Hải quân Nhật Bản thực hiện nhằm vào căn cứ hải quân của Hoa Kỳ tại Trân Châu Cảng thuộc tiểu bang Hawaii vào sáng Chủ Nhật, ngày 7 tháng 12 năm 1941, dẫn đến việc Hoa Kỳ sau đó quyết định tham gia vào hoạt động quân sự trong Chiến tranh thế giới thứ hai."
    }

    if args.lang not in example_sentences:
        print(
            f'Sorry, but we don\'t have a demo sentence for "{args.lang}" for the moment. Try one of these languages: {list(example_sentences.keys())}'
        )
        exit()

    # download the models
    stanfordnlp.download(args.lang,
                         args.models_dir,
                         force_download=False,
                         confirm_if_exists=False)
    # set up a pipeline
    print('---')
    print('Building pipeline...')
    pipeline = stanfordnlp.Pipeline(models_dir=args.models_dir,
                                    lang=args.lang,
                                    use_gpu=(not args.cpu))
    # process the document
    doc = pipeline(example_sentences[args.lang])
    # access nlp annotations
    print('')
    print('Input: {}'.format(example_sentences[args.lang]))
    print("The tokenizer split the input into {} sentences.".format(
        len(doc.sentences)))
    print('---')
Example #28
import stanfordnlp
import time

root = '/root'
lang = 'en'
text = "You, you love it how I move you\nYou love it how I touch you\nMy one, when all is said and done\nYou'll believe God is a woman\n\nAnd I, I feel it after midnight\nA feeling that you can't fight\nMy one, it lingers when we're done\nYou'll believe God is a woman\n\nI don't wanna waste no time, yuh\nYou ain't got a one-track mind, yuh\nHave it any way you like, yuh\nAnd I can tell that you know\nI know how I want it\n\nAin't nobody else can relate\nBoy I like that you ain't afraid\nBaby lay me down and let's pray\nI'm telling you the way I like it\nHow I want it\n\nYuh\nAnd I can be all of things you tell me not to be, yuh\nWhen you try to come for me I keep on flourishing, yuh\nAnd he see the universe when I'm in company, uh\nIt's all in me\n\nYou, you love it how I move you\nYou love it how I touch you\nMy one, when all is said and done\nYou'll believe God is a woman\n\nAnd I, I feel it after midnight\nA feeling that you can't fight\nMy one, it lingers when we're done\nYou'll believe God is a woman\n\nI tell you all the things you should know\nSo baby take my hands, save your soul\nWe can make it last, take it slow\nAnd I can tell that you know\nI know how I want it\n\nBut you different from the rest\nAnd boy if you confess you might get blessed\nSee if you deserve what comes next\nI'm telling you the way I like it\nHow I want it\n\nYuh\nAnd I can be all of things you tell me not to be, yuh\nWhen you try to come for me I keep on flourishing, yuh\nAnd he see the universe when I'm in company\nIt's all in me\n\nYou, you love it how I move you\nYou love it how I touch you\nMy one, when all is said and done\nYou'll believe God is a woman\n\nAnd I, I feel it after midnight\nA feeling that you can't fight\nMy one, it lingers when we're done\nYou'll believe God is a woman, yeah yeah\n\nGod is a woman, yeah yeah\nGod is a woman\nMy one (one)\nWhen all is said and done\nYou'll believe God is a woman\n\n(You'll believe God)\nGod is a woman (oh, yeah)\nGod is a woman, yeah\n(One) It lingers when we're done\nYou'll believe God is a woman"
processors = "tokenize,mwt,pos,lemma"  #tokenize,mwt,pos,lemma,depparse

stanfordnlp.download(lang,
                     resource_dir=root,
                     should_download=True,
                     confirm_if_exists=False)

start_time = time.time()
pipeline = stanfordnlp.Pipeline(
    lang=lang, models_dir=root, use_gpu=False,
    processors=processors)  # This sets up a default neural pipeline in English
elapsed_time = time.time() - start_time
print("loaded in:%f" % elapsed_time)

start_time = time.time()
doc = pipeline(text)
elapsed_time = time.time() - start_time
print("parsed in:%f" % elapsed_time)

for sentence in doc.sentences:
    sentence.print_dependencies()
    #sentence.print_tokens()
    for token in sentence.tokens:
        word = token.words[0]
        print("Index:%s word:%s lemma:%s" %
              (word.index, word.text, word.lemma))
Example #29
import stanfordnlp
import torch
from transformers import *

stanfordnlp.download(
    'en')  # download the English models that the neural pipeline will use
nlp = stanfordnlp.Pipeline()  # setting a default neural pipeline for english
# Now nlp is a function that receives a string as input and returns an nlp object

# create tokenizer
pretrained_weights = 'scibert-scivocab-uncased'
tokenizer_class = BertTokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

inp_string = "The data for our first experiment is a corpus of parsed sentences from the Penn Treebank"
inp_tokens = tokenizer.encode(inp_string)

sequence = nlp(inp_string)
sequence.sentences[0].print_dependencies()
#print(f'Sentence length is: {len(sequence)}')
#print(f'Tokenized length is: {len(inp_tokens)}')
Example #30
        g.add_node(noun, **node_attributes)

    # Add edges
    combos = list(itertools.combinations(nouns, r=2))
    for i, combo in enumerate(combos):
        combo = tuple(sorted(list(combo)))
        if combo in WEIGHTS.keys():
            WEIGHTS[combo] += 1
            attr = {'weight': WEIGHTS[combo]}
            g.change_edge(id=f'{combo[0]}-{combo[1]}', **attr)
        else:
            WEIGHTS[combo] = 1
            attr = {'directed': False, 'weight': WEIGHTS[combo]}
            g.add_edge(id=f'{combo[0]}-{combo[1]}',
                       source=combo[0],
                       target=combo[1],
                       **attr)


# Download model for nlp
stanford_path = Path.home() / 'stanfordnlp_resources' / 'da_ddt_models'
if not stanford_path.exists():
    stanfordnlp.download('da')

# Set up Gephi client
g = pygephi.GephiClient('http://localhost:8080/workspace1', autoflush=True)
g.clean()

# Keep track of edgelist (for weights)
WEIGHTS = dict()
Example #31
import pandas as pd
from igannotator.annotator import StanfordAnnotator
import stanfordnlp
import os
import re

RESOURCES_DIR = "resources"

stanfordnlp.download("pl",
                     resource_dir=RESOURCES_DIR,
                     confirm_if_exists=False,
                     force=True)
annotator = StanfordAnnotator(RESOURCES_DIR)

directory = "./data/conllu/goldStandard-stanford"
if not os.path.exists(directory):
    os.makedirs(directory)

with open("data/nauka_1.txt", "r+", encoding="utf8") as input_file:
    content = input_file.read()
lines = [line for line in content.split("\n\n") if not line.startswith("--")]
for line in lines:
    line_regex = re.compile("^([0-9]*)\\. ((?s).*)$")
    regex_result = line_regex.search(line)
    number = regex_result.group(1)
    text = regex_result.group(2)
    print(text)
    try:
        dfs = annotator.annotate(text)

        output_df = pd.DataFrame()
Example #32
def download_stanfordnlp():
    import stanfordnlp
    stanfordnlp.download('en')
    parser.add_argument(
        '--output_path',
        type=str,
        default='tsv_features',
        help=
        'Path to folder that contain final tsv feature files used for classification'
    )
    parser.add_argument('--test',
                        action='store_true',
                        help='Parse test set with no target class')
    args = parser.parse_args()

    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    stanfordnlp.download(
        'en')  # This downloads the English models for the neural pipeline

    if args.test:
        parse_test_transcripts(
            args.input_path,
            os.path.join(args.output_path, 'text_features_test.txt'))
        build_test_csv(
            os.path.join(args.output_path, 'text_features_test.txt'),
            os.path.join(args.output_path, 'text_features_test.tsv'))
        os.remove(os.path.join(args.output_path, 'text_features_test.txt'))
    else:
        parse_transcripts(
            args.input_path,
            os.path.join(args.output_path, 'text_features_train.txt'))
        build_csv(os.path.join(args.output_path, 'text_features_train.txt'),
                  os.path.join(args.output_path, 'text_features_train.tsv'))