Example #1
from nltk.tag.stanford import StanfordPOSTagger
import progressbar


def pos_tagging(docs, stanford_path, pos_tagger):
    print("\nGenerating Part-of-Speech tags...")

    # Configuring Stanford NLP POS tagger
    path_to_model = "{}/models/{}.tagger".format(stanford_path, pos_tagger)
    path_to_jar = "{}/stanford-postagger.jar".format(stanford_path)

    tagger = StanfordPOSTagger(model_filename=path_to_model,
                               path_to_jar=path_to_jar)
    # Setting higher memory limit for long sentences
    tagger.java_options = '-mx8192m'

    data = []
    for doc in progressbar.progressbar(docs):
        # Obtain the list of tokens in the document
        tokens = [t for t, label in doc]

        try:
            # Perform POS tagging
            tagged = tagger.tag(tokens)
        except Exception:
            # Skip documents the tagger cannot process
            continue

        # Take the word, POS tag, and its label
        data.append([(w, pos, label)
                     for (w, label), (word, pos) in zip(doc, tagged)])
    return data
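A minimal usage sketch for pos_tagging, assuming a Stanford POS tagger distribution unpacked at ./stanford-postagger and documents given as lists of (token, label) pairs; the path, model name, and sample document are illustrative, not taken from the example:

# Hypothetical input: each doc is a list of (token, label) pairs.
docs = [[("John", "PER"), ("lives", "O"), ("in", "O"), ("Berlin", "LOC")]]
# Assumed local install path and model name; adjust to your setup.
tagged_docs = pos_tagging(docs, "./stanford-postagger", "english-bidirectional-distsim")
# Each entry is a list of (word, POS tag, label) triples.
print(tagged_docs[0])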
def get_tagger():
    ''' Set up & return the Stanford Tagger object.'''
    path_to_model = "/home/avery/Applications/stanford-postagger-2018-02-27/models/english-bidirectional-distsim.tagger"
    path_to_jar = "/home/avery/Applications/stanford-postagger-2018-02-27/stanford-postagger.jar"
    tagger = StanfordPOSTagger(path_to_model, path_to_jar)
    tagger.java_options = "-mx8192m"
    # Use: tagger.tag(word_tokenize(string))
    return tagger
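A short sketch of the usage hinted at in get_tagger's comment (the hard-coded paths above are machine-specific, so the tagger files must exist at those locations):

from nltk import word_tokenize

tagger = get_tagger()
# Returns a list of (word, POS tag) tuples for the tokenized sentence.
print(tagger.tag(word_tokenize("The quick brown fox jumps over the lazy dog.")))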
def pos_tagger(text):
    from nltk.tag.stanford import StanfordPOSTagger
    english_postagger = StanfordPOSTagger(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/models/english-left3words-distsim.tagger',
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar'
    )
    english_postagger.java_options = '-mx4096m'
    tags = english_postagger.tag(text)
    return tags
Example #4
def posInput(text):
	print("POS")
	path_to_model = "./stanford-postagger/models/english-caseless-left3words-distsim.tagger"
	path_to_jar = "./stanford-postagger/stanford-postagger.jar"
	tagger=StanfordPOSTagger(path_to_model, path_to_jar)
	tagger.java_options='-mx4096m'          ### Setting higher memory limit for long sentences
	# sentence = 'THIS IS TESTING'
	result = tagger.tag(word_tokenize(text))
	# print result
	return result
Example #5
 def _POS(self, txt, id):
     self.df[['ID', 'pos']].to_csv('pos_ner.csv', sep='\t')
     path_pos = '/home/ise/NLP/stanfordNLP/stanford-postagger-full-2017-06-09/stanford-postagger.jar'
     model_path = '/home/ise/NLP/stanfordNLP/stanford-postagger-full-2017-06-09/models/english-bidirectional-distsim.tagger'
     import nltk
     from nltk.tag.stanford import StanfordPOSTagger
     tagger = StanfordPOSTagger(model_path, path_pos)
     tagger.java_options = '-mx8096m'  ### Setting higher memory limit for long sentences
     tokens = nltk.word_tokenize(txt)
     pos_res = tagger.tag(tokens)
     filepath = '/home/ise/NLP/NLP3/pos/pos_{}.txt'.format(id)
     with open(filepath, 'w') as file_handler:
         for item in pos_res:
             file_handler.write("{}\n".format(item))
     return pos_res
Example #6
def transform_to_pos(text):
    import os
    #os.environ['JAVAHOME'] = java_path
    from nltk.corpus import sentiwordnet as swn
    from nltk.tag.stanford import StanfordPOSTagger
    from nltk import word_tokenize

    path_to_model = "./postagging/english-bidirectional-distsim.tagger"
    path_to_jar = "./postagging/stanford-postagger.jar"
    tagger = StanfordPOSTagger(path_to_model, path_to_jar)
    tagger.java_options = '-mx4096m'  ### Setting higher memory limit for long sentences
    tokens = word_tokenize(text)
    size = len(tokens)
    from collections import Counter
    pos = tagger.tag(tokens)
    counts = Counter(tag for word, tag in pos)
    for key in counts:
        counts[key] /= size
    counts["totalWordsCount"] = size
    counts[";"] = tokens.count(";") / size
    counts["questionmarks"] = tokens.count("?") / size
    counts["exclamationmarks"] = tokens.count("!") / size
    counts["Quotes"] = tokens.count("\"") / size
    # Drop the sentence-final period tag if present
    counts.pop(".", None)
    from collections import OrderedDict
    ot = [
        'NNP', 'VBD', 'VBN', 'IN', 'CD', 'VBP', ',', 'DT', 'NN', 'JJ', 'RB',
        'TO', 'SYM', 'PRP', 'NNS', 'CC', 'PRP$', 'POS', 'FW', 'VBG', ':',
        'WRB', 'EX', 'JJR', 'WDT', 'totalWordsCount', ';', 'questionmarks',
        'exclamationmarks', 'Quotes'
    ]
    counts = OrderedDict(counts)
    for key in ot:
        if key not in counts:
            counts[key] = 0
    tmp = counts.copy()
    for key in tmp:
        if key not in ot:
            counts.pop(key, None)
    dab = {}
    for i in ot:
        dab[i] = counts[i]
    counts = dab.copy()
    return counts
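For illustration, a small sketch of what transform_to_pos returns, assuming the tagger files exist under ./postagging/: a fixed-key dict mapping POS tags and punctuation marks to their share of the token count, plus the raw token count under totalWordsCount (the sample sentence is made up):

features = transform_to_pos("Is this a test? Yes, it is!")
# POS-tag keys and punctuation keys hold fractions of the token count.
print(features["totalWordsCount"], features["questionmarks"], features["NN"])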
def get_pos_sentence(sentences_spans, pos_vocab):
    """
    Get POS tags for each sentence. (needed to build end2end system)
    :param start:
    :param end:
    :return:
    """
    #raw_dir_simple = read.read_from_json('test/test_dir_simple')   #### in folder data/
    #raw_dir_simple = read.read_from_json('clinical_data/train_samples1_simples')
    #raw_dir_simple = read.read_from_json('agriculture_data/raw_dir_simple')

    #raw_dir_simple = ["NYT19980206.0466"]
    english_postagger = StanfordPOSTagger(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/models/english-left3words-distsim.tagger',  #### in folder data/
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar'
    )  #### in folder data/
    english_postagger.java_options = '-mx8000m'
    pos_sentences = list()

    for sent_span in sentences_spans:
        print(sent_span[0])
        text = nltk.word_tokenize(sent_span[0])
        # StanfordPOSTagger fails to tag the underscore; see https://github.com/nltk/nltk/issues/1632.
        # With nltk 3.2.2, change "word_tags = tagged_word.strip().split(self._SEPARATOR)" in
        # parse_output of nltk/tag/stanford.py to "word_tags = tagged_word.strip().rsplit(self._SEPARATOR, 1)"
        # to handle the underscore issue.
        text_pos = english_postagger.tag(text)

        index = 0
        for token in text_pos:
            # if (text[index] != token[0]) and (token[0] == '``' or token[0] == "''"):  ######### deal with the double quotes, in nltk.tokenize treebank.py change the tokenizer for double quotes. Reasons: (double quotes (") are changed to doubled single forward- and backward- quotes (`` and ''))
            #     text_pos[index] = ["\"", "\'\'"]
            if text[index] == token[0] and token[0] == "``" and text[
                    index] not in sent_span[0]:
                text_pos[index] = ["\"", "``"]
            if text[index] == token[0] and token[0] == "''" and text[
                    index] not in sent_span[0]:
                text_pos[index] = ["\"", "\'\'"]
            if text[index] == token[0] and token[0] in ['{', '(', '[']:
                text_pos[index] = [token[0], "("]
            if text[index] == token[0] and token[0] in ['}', ')', ']']:
                text_pos[index] = [token[0], ")"]
            pos_vocab[token[1]] += 1
            index += 1
        pos_sentences.append(text_pos)
    return pos_sentences, pos_vocab
def pos_sentence(start=0, end=63):
    """
    Get POS tags for each sentence. (needed to build end2end system)
    :param start:
    :param end:
    :return:
    """
    raw_dir_simple = read.read_from_json(
        'raw_dir_simple')  #### in folder data/
    english_postagger = StanfordPOSTagger(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/models/english-left3words-distsim.tagger',  #### in folder data/
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar'
    )  #### in folder data/
    english_postagger.java_options = '-mx4096m'

    pos = list()

    for data_id in range(start, end):
        sentences_spans = read.read_from_json("training_sentence/sentences/" +
                                              raw_dir_simple[data_id])
        print(raw_dir_simple[data_id])
        pos_sentences = list()
        for sent_span in sentences_spans:
            print(sent_span[0])
            text = nltk.word_tokenize(sent_span[0])
            # StanfordPOSTagger fails to tag the underscore; see https://github.com/nltk/nltk/issues/1632.
            # With nltk 3.2.2, change "word_tags = tagged_word.strip().split(self._SEPARATOR)" in
            # parse_output of nltk/tag/stanford.py to "word_tags = tagged_word.strip().rsplit(self._SEPARATOR, 1)"
            # to handle the underscore issue.
            k = english_postagger.tag(text)
            index = 0

            for token in k:
                # Handle double quotes: the treebank tokenizer converts double quotes (")
                # into paired backticks/forward quotes (`` and ''), so map them back.
                if (text[index] != token[0]) and (token[0] == '``' or token[0] == "''"):
                    k[index] = ["\"", "\'\'"]
                if token[1] not in pos:
                    pos.append(token[1])
                index += 1
            pos_sentences.append(k)

        read.save_in_json("training_sentence/pos/" + raw_dir_simple[data_id],
                          pos_sentences)
    read.save_in_json("training_sentence/pos/pos_tag", pos)
Example #9
def get_pos_sentence(sentences_spans, pos_vocab):
    """
    Get POS tags for each sentence. (needed to build end2end system)
    :param start:
    :param end:
    :return:
    """
    #raw_dir_simple = read.read_from_json('test/test_dir_simple')   #### in folder data/
    #raw_dir_simple = read.read_from_json('clinical_data/train_samples1_simples')
    #raw_dir_simple = read.read_from_json('agriculture_data/raw_dir_simple')

    #raw_dir_simple = ["NYT19980206.0466"]
    english_postagger = StanfordPOSTagger(
        StandforParser,    #### in folder data/
        StandforParser_jar) #### in folder data/
    english_postagger.java_options = '-mx8000m'
    pos_sentences = list()

    for sent_span in sentences_spans:
        print(sent_span[0])
        text = nltk.word_tokenize(sent_span[0])
        # StanfordPOSTagger fails to tag the underscore; see https://github.com/nltk/nltk/issues/1632.
        # With nltk 3.2.2, change "word_tags = tagged_word.strip().split(self._SEPARATOR)" in
        # parse_output of nltk/tag/stanford.py to "word_tags = tagged_word.strip().rsplit(self._SEPARATOR, 1)"
        # to handle the underscore issue.
        text_pos = english_postagger.tag(text)

        index = 0
        for token in text_pos:
            # if (text[index] != token[0]) and (token[0] == '``' or token[0] == "''"):  ######### deal with the double quotes, in nltk.tokenize treebank.py change the tokenizer for double quotes. Reasons: (double quotes (") are changed to doubled single forward- and backward- quotes (`` and ''))
            #     text_pos[index] = ["\"", "\'\'"]
            if text[index] == token[0] and token[0] == "``" and text[index] not in sent_span[0]:
                text_pos[index] = ["\"", "``"]
            if text[index] == token[0] and token[0] == "''" and text[index] not in sent_span[0]:
                text_pos[index] = ["\"", "\'\'"]
            if text[index] == token[0] and token[0] in ['{', '(', '[']:
                text_pos[index] = [token[0], "("]
            if text[index] == token[0] and token[0] in ['}', ')', ']']:
                text_pos[index] = [token[0], ")"]
            pos_vocab[token[1]] += 1
            index += 1
        pos_sentences.append(text_pos)
    return pos_sentences, pos_vocab
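A hedged driver sketch for get_pos_sentence above, assuming StandforParser and StandforParser_jar are defined at module level, sentences_spans holds (sentence_text, span) pairs, and pos_vocab is a Counter of tag frequencies; the sample sentence and span are invented:

from collections import Counter

sentences_spans = [("The market fell sharply on Monday.", (0, 35))]
pos_sentences, pos_vocab = get_pos_sentence(sentences_spans, Counter())
print(pos_sentences[0])  # [(word, POS tag), ...] for the sentence
print(pos_vocab)         # POS tag -> frequency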
def generate_pos(start=0, end=63):
    english_postagger = StanfordPOSTagger(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/models/english-left3words-distsim.tagger',
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar'
    )
    english_postagger.java_options = '-mx4096m'
    raw_text_dir = read.read_from_json('raw_data_dir')
    data_size = len(raw_text_dir)
    pos = list()

    for data_id in range(start, end):

        raw_text = read.read_from_dir(raw_text_dir[data_id])
        print(raw_text_dir[data_id])
        contents = list()
        for line in raw_text.splitlines():
            print(line)
            text = nltk.word_tokenize(line)
            print(text)
            if len(text) == 0:
                k = []
            else:
                k = english_postagger.tag(text)
                index = 0
                for token in k:
                    # Handle double quotes: the treebank tokenizer converts double quotes (")
                    # into paired backticks/forward quotes (`` and ''), so map them back.
                    if (text[index] != token[0]) and (token[0] == '``' or token[0] == "''"):
                        k[index] = ["\"", "\'\'"]
                    if token[1] not in pos:
                        pos.append(token[1])
                    index += 1
            contents.append(k)

        read.save_json("data/pos/" + raw_text_dir[data_id].rsplit('\\', 1)[1],
                       contents)
    read.save_in_json("pos_tag", pos)
Example #11
from nltk.tag.stanford import StanfordPOSTagger
import os

path_to_model = os.path.join(
    os.getcwd(), "StanfordNLP/pos/models/english-bidirectional-distsim.tagger")
path_to_jar = os.path.join(os.getcwd(),
                           "StanfordNLP/pos/stanford-postagger.jar")
POStagger = StanfordPOSTagger(path_to_model, path_to_jar)
POStagger.java_options = '-mx4096m'  ### Setting higher memory limit for long sentences
from nltk.tokenize import word_tokenize


def getPOSFocus(sentence):
    sentence = sentence.lower()
    result = []
    tagged = POStagger.tag(word_tokenize(sentence))
    for (word, cat) in tagged:
        if cat.startswith("NN") or cat.startswith("JJ"):
            result.append(word.lower())
    return result


if __name__ == '__main__':
    text = "Where is HEC( Himalayan Explorers Club ) office ?"
    print(getPOSFocus(text))
Example #12
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf


if __name__ == '__main__':
    path_to_model = 'stanford-postagger/models/english-bidirectional-distsim.tagger'
    path_to_jar = 'stanford-postagger/stanford-postagger.jar'
    tagger = StanfordPOSTagger(path_to_model, path_to_jar)
    tagger.java_options = '-mx512m'

    # Load the data-set
    pos_review = open('short_reviews/train/positive.txt',
                      encoding='ISO-8859-1').readlines()
    neg_review = open('short_reviews/train/negative.txt',
                      encoding='ISO-8859-1').readlines()

    multiThread_parse(pos_review, neg_review)
    # Save all the adjectives to a file
    with open("trained_models/documents.pickle", "wb") as save_documents:
        pickle.dump(documents, save_documents)

    all_words = nltk.FreqDist(all_words)
    print(len(list(all_words.keys())))
    word_features = list(all_words.keys())[:6000]
Example #13
# https://stackoverflow.com/questions/34692987/cant-make-stanford-pos-tagger-working-in-nltk
# https://nlp.stanford.edu/software/tagger.html
# http://www.nltk.org/_modules/nltk/tag/stanford.html#CoreNLPPOSTagger
import os

from nltk.tag.stanford import StanfordPOSTagger

current_path = os.path.dirname(os.path.realpath(__file__))

path_to_model = "input/stanford/stanford-postagger-full-2018-10-16/models/english-bidirectional-distsim.tagger"
path_to_jar   = "input/stanford/stanford-postagger-full-2018-10-16/stanford-postagger.jar"

path_to_model = os.path.join(current_path, '..', path_to_model)
path_to_jar = os.path.join(current_path, '..', path_to_jar)

standford_tagger = StanfordPOSTagger(path_to_model, path_to_jar)
standford_tagger.java_options = '-mx1024m'          ### Setting higher memory limit for long sentences

# https://pythonprogramming.net/named-entity-recognition-stanford-ner-tagger/
from nltk.tag import StanfordNERTagger

path_to_model = "input/stanford/stanford-ner-2014-08-27/classifiers/english.all.3class.distsim.crf.ser.gz"
path_to_jar   = "input/stanford/stanford-ner-2014-08-27/stanford-ner.jar"

path_to_model = os.path.join(current_path, '..', path_to_model)
path_to_jar = os.path.join(current_path, '..', path_to_jar)

standford_ner = StanfordNERTagger(path_to_model, path_to_jar)
standford_ner.java_options = '-mx1024m'          ### Setting higher memory limit for long sentences
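A brief usage sketch for the two taggers configured above (the sentence is only illustrative):

from nltk import word_tokenize

tokens = word_tokenize("Barack Obama visited Stanford University in California.")
print(standford_tagger.tag(tokens))  # (word, POS tag) pairs
print(standford_ner.tag(tokens))     # (word, entity label) pairs, e.g. PERSON / ORGANIZATION / LOCATION / O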


Example #14
def calculate_nouns_per_doc_per_part(rootdirectory, df_file_from_pos):
    from pos import prepare_full_doc
    import pandas as pd
    import re
    import os
    from log_creating import df_into_csv
    from nltk.tag.stanford import StanfordPOSTagger
    """ This function lets you calculate number of words per part per article per corpus;
	calculate nouns per part per article per corpus """
    df = pd.DataFrame({
        'Subcorpus': [],
        'Type of Article': [],
        'Article': [],
        'Teil': [],
        'WordsN': [],
        'NounsN': []
    })
    print("df is created", df)
    t = ''
    Teil1 = "<Intro>"
    Teil2 = "<Middle>"
    Teil3 = "<Conclusion>"
    path_to_model = "C:/Users/kole021/Downloads/stanford-postagger-full-2017-06-09/models/english-bidirectional-distsim.tagger"
    path_to_jar = "C:/Users/kole021/Downloads/stanford-postagger-full-2017-06-09/stanford-postagger.jar"
    tagger = StanfordPOSTagger(path_to_model, path_to_jar)
    java_path = "C:\\Program Files\\Java\\jdk1.8.0_181\\bin\\java.exe"
    os.environ['JAVAHOME'] = java_path
    tagger.java_options = '-mx4096m'
    for subdir, dirs, files in os.walk(rootdirectory):
        for file in files:
            #print(subdir)
            sub = re.sub(".*/", "", subdir)
            filepath = subdir + os.sep + file  #

            if (filepath.endswith(".txt")) & (
                    file.startswith("log.txt")
                    == False) & (file.startswith("README.txt") == False):
                word_count = 0
                CNC_up_3_count = 0
                words_for_CNCs = 0
                print(sub)
                print(filepath)
                print(file)
                print("Processing " + file + " in " + sub)
                t = prepare_full_doc(filepath)
                sentences = re.split("\.|\?|:|;", t)
                last_first_word = ""
                noun_count = 0
                word_number = 0
                Teile = {
                    Teil1: [0, 0],
                    Teil2: [0, 0],
                    Teil3: [0, 0]
                }  #0 = words number; 1 = nouns number;
                Current_teil = Teil1
                for sent in sentences:
                    p = []
                    sent = re.sub('\n', ' ', sent)  # replace newlines with spaces (TODO: handle other line breaks)
                    try:
                        p = tagger.tag(re.split(
                            " ", sent))  #list of tuples for every sentence
                    except Exception:
                        print("failed to tag the sentence: " + sent)
                    for word in p:
                        word_number += 1
                        if ((len(word[0]) > 2) &
                            (re.search("[\d@+]", word[0]) == None) &
                            (re.search("\(.\)", word[0]) == None)) & (
                                (word[1] == 'NN') | (word[1] == 'NNS')):
                            noun_count += 1
                        if word[0] == Teil2:
                            print("End of Introduction")
                            print("Teil 1", word_number, noun_count)
                            Teile[Teil1][0] = word_number
                            Teile[Teil1][1] = noun_count
                            word_number = 0
                            noun_count = 0
                            Current_teil = Teil2
                        elif word[0] == Teil3:
                            print("End of Middle")
                            print("Teil 2", word_number, noun_count)
                            Teile[Teil2][0] = word_number
                            Teile[Teil2][1] = noun_count
                            word_number = 0
                            noun_count = 0
                            Current_teil = Teil3
                print("Teil 3", word_number, noun_count)
                print("End of Conclusion")
                Teile[Teil3][0] = word_number
                Teile[Teil3][1] = noun_count
                word_number = 0
                noun_count = 0
                df = df.append(
                    {
                        'Subcorpus': sub,
                        'Type of Article': "",
                        'Article': file,
                        'Teil': "<Intro>",
                        'WordsN': Teile[Teil1][0],
                        'NounsN': Teile[Teil1][1]
                    },
                    ignore_index=True)
                df = df.append(
                    {
                        'Subcorpus': sub,
                        'Type of Article': "",
                        'Article': file,
                        'Teil': "<Middle>",
                        'WordsN': Teile[Teil2][0],
                        'NounsN': Teile[Teil2][1]
                    },
                    ignore_index=True)
                df = df.append(
                    {
                        'Subcorpus': sub,
                        'Type of Article': "",
                        'Article': file,
                        'Teil': "<Conclusion>",
                        'WordsN': Teile[Teil3][0],
                        'NounsN': Teile[Teil3][1]
                    },
                    ignore_index=True)
                print(df)
    ##for_changes = pd.read_csv(df_file_from_pos, sep = '\t', header = 0)
    ##table = pd.pivot_table(for_changes, index = ['Article', 'Teil'], values = 'CNC Length', aggfunc = np.sum)
    ##print(type(table))

    df_into_csv(df, "df_nouns_and_all_words_counts_whole_corpus_part_2.csv")
Example #15
def	get_CNCs_up_3_of_a_txt_file(df2, df, filepath, file, sub, parts_directory): 
	""" Gets all CNCs of one text and creates files in parts folder for them """
	#print("get_CNCs_up_3_of_a_txt_file",df)
	import os
	import pandas as pd
	from nltk.tag.stanford import StanfordPOSTagger
	path_to_model = "C:/Users/kole021/Downloads/stanford-postagger-full-2017-06-09/models/english-bidirectional-distsim.tagger"
	path_to_jar = "C:/Users/kole021/Downloads/stanford-postagger-full-2017-06-09/stanford-postagger.jar"
	tagger=StanfordPOSTagger(path_to_model, path_to_jar)
	java_path = "C:\\Program Files\\Java\\jdk1.8.0_181\\bin\\java.exe"
	os.environ['JAVAHOME'] = java_path
	tagger.java_options='-mx4096m'
	from log_creating import write_log
	import re
	words_for_CNCs = 0
	word_count = 0
	CNC_up_3_count = 0
	noun_count = 0
	if (filepath.endswith(".txt"))&(file.startswith("log.txt")==False)&(file.startswith("README.txt")==False):
		word_count = 0
		noun_count = 0
		Teil1  = "<Intro>"
		Teil2 = "<Middle>"
		Teil3 = "<Conclusion>"
		Ende  = "<Ending>"
		Teile2 = {Teil1: [0, 0, 0], Teil2: [0, 0, 0], Teil3: [0, 0, 0]} 
		CNC_up_3_count = 0
		words_for_CNCs = 0
		print(sub)
		print (filepath)
		print(file)
		write_log(parts_directory, "Processing " + file + " in " + sub)
		t = prepare_full_doc(filepath)
		sentences = re.split("\.|\?|:|;", t)
		
		sent_number = 0
		word_number = 0

		Teile = {Teil1: [0, 0, 0, 0], Teil2: [0, 0, 0, 0], Teil3: [0, 0, 0, 0]} #0 = word count; 1 = CNC p 3 count; 2  = how many words contain CNCs; 3=what part of all words that are in CNCs are the CNCs in this particular part
		Current_teil = Teil1
		for sent in sentences:
			p = []
			sent = re.sub('\n', ' ',sent ) ###add more deleting enters
			sent_number += 1
			try:
				p = tagger.tag(re.split(" ", sent)) #list of tuples for every sentence
			except:
				write_log(parts_directory, "failed to tag the sentence: " + sent)
			maxCNC = 0
			maxCNCstring = ""
			last_first_word = ""
			other_word = False

			for word in p:
				#print(word)
				word_number += 1
				#print(word_number)
				word_count += 1
				##print("word_count = ", word_count)
				if ((len(word[0])>2) & (re.search("[\d@+]", word[0])==None) & (re.search("\(.\)",word[0])==None))&(  (word[1] == 'NN')|(word[1] == 'NNS')|( ((word[1] == 'JJ')|(word[1] == 'JJR')|(word[1] == 'JJS'))&(maxCNC == 0) )  ):
					word_word = word[0]
					if ((len(word[0])>2) & (re.search("[\d@+]", word[0])==None) & (re.search("\(.\)",word[0])==None))&(  (word[1] == 'NN')|(word[1] == 'NNS')  ):
							noun_count += 1 
					#print(word)
					if re.search("/", word_word)==None:
						word_word = re.split("/",word_word)[0]
					if other_word == True:
						if last_first_word != "":
							maxCNCstring = last_first_word + " " + word_word
							last_first_word = ""
							maxCNC = 2
							other_word = False
						else:							
							other_word = False
							CNC_up_3_count, words_for_CNCs, df = create_maxCNC_txt_in_parts_if_up3_and_count(df, words_for_CNCs, CNC_up_3_count, maxCNCstring, maxCNC, parts_directory, sub, file, Current_teil, sent_number, word_number, sentences)
							maxCNC = 1
							maxCNCstring = word_word
					else:
						if ((word_word[len(word_word)-1]==")")|(word_word[len(word_word)-1]=="’")|(word_word[len(word_word)-1]=="’")|(word_word[len(word_word)-1]==",")|(word_word[len(word_word)-1]==";")|(word_word[len(word_word)-1]=="'")|(word_word[len(word_word)-1]=="'")|(word_word[len(word_word)-1]==":")):
							
							other_word  = True
							maxCNC += 1
							maxCNCstring += " " + word_word
							last_first_word = ""
						else:
							try:
								if re.search("[:\"\(\[]", word_word[0])!=None:
									last_first_word = word_word
									other_word = True
								else:
									maxCNC += 1
									maxCNCstring += " " + word_word
							except:
								write_log(parts_directory, "Fail to process ( in word_word: " + word_word)
					##if ((word_word[len(word_word)-2]==")")|(word_word[len(word_word)-2]=="’")|(word_word[len(word_word)-2]=="’")|(word_word[len(word_word)-2]==",")|(word_word[len(word_word)-2]==";")|(word_word[len(word_word)-2]=="'")|(word_word[len(word_word)-2]=="'")|(word_word[len(word_word)-2]==":")):
					##	other_word  = True
				else:
					other_word  = True
					last_first_word = ""
				if re.search(Teil2, word[0])!=None:
					write_log(parts_directory, "End of Introduction")
					print("Teil 1", word_count, CNC_up_3_count, words_for_CNCs)
					Teile[Teil1][0] = word_count
					Teile[Teil1][1] = CNC_up_3_count
					Teile[Teil1][2] = words_for_CNCs

					Teile2[Teil1][0] = word_count
					Teile2[Teil1][1] = noun_count
					word_count = 0
					noun_count = 0

					CNC_up_3_count = 0
					words_for_CNCs = 0
					Current_teil = Teil2
				elif re.search(Teil3, word[0])!=None:
					write_log(parts_directory, "End of Middle")
					print("Teil 2", word_count, CNC_up_3_count, words_for_CNCs)
					Teile[Teil2][0] = word_count
					Teile[Teil2][1] = CNC_up_3_count
					Teile[Teil2][2] = words_for_CNCs

					Teile2[Teil2][0] = word_count
					Teile2[Teil2][1] = noun_count
					noun_count = 0
					word_count = 0

					CNC_up_3_count = 0
					words_for_CNCs = 0
					Current_teil = Teil3
				elif re.search(Ende, word[0])!=None:
					print(word[0], " THIS IS WORD 0 BY ENDE")
					print("Teil 3", word_count, CNC_up_3_count, words_for_CNCs)
					write_log(parts_directory, "End of Conclusion")
					Teile[Teil3][0] = word_count
					Teile[Teil3][1] = CNC_up_3_count
					Teile[Teil3][2] = words_for_CNCs

					Teile2[Teil3][0] = word_count
					Teile2[Teil3][1] = noun_count
					word_count = 0
					noun_count = 0
					df2 = df2.append({'Subcorpus': sub, 'Type of Article': "", 'Article': file, 'Teil': "<Intro>", 'WordsN': Teile2[Teil1][0], 'NounsN': Teile2[Teil1][1], 'CNCsNounsN':Teile[Teil1][2]}, ignore_index=True)
					df2 = df2.append({'Subcorpus': sub, 'Type of Article': "", 'Article': file, 'Teil': "<Middle>", 'WordsN': Teile2[Teil2][0], 'NounsN': Teile2[Teil2][1], 'CNCsNounsN':Teile[Teil2][2]}, ignore_index=True)
					df2 = df2.append({'Subcorpus': sub, 'Type of Article': "", 'Article': file, 'Teil': "<Conclusion>", 'WordsN': Teile2[Teil3][0], 'NounsN': Teile2[Teil3][1], 'CNCsNounsN':Teile[Teil3][2]}, ignore_index=True)
					print("new line in df2:", df2)

		CNC_up_3_count = 0
		words_for_CNCs = 0
		sum_CNCs = 0
		sum_total = 0
		word_count = 0
		noun_count = 0
		for teil in Teile.keys():
		# 	print("teil in Keys()", teil)
		# 	write_log(parts_directory, "words in the " + teil + str(Teile[teil][0]))
		# 	write_log(parts_directory, "CNCs up 3 in the " + teil + str(Teile[teil][1]))
		# 	write_log(parts_directory, "words that contains CNCs(N>=3) in " + teil + str(Teile[teil][2]))
			sum_CNCs += Teile[teil][2]
			sum_total += Teile[teil][0]
		full_ratio = sum_CNCs/sum_total
		for teil in Teile.keys():
			try:
				Teile[teil][3] = Teile[teil][2]/Teile[teil][0]
			except ZeroDivisionError:
				Teile[teil][3]  = 0 
			write_log(parts_directory, "words that contains CNCs(N>=3) in " + teil + " are " + str(Teile[teil][3]) + " \% of this teil")
		write_log(parts_directory, "In this document CNCs >=3 are " + str(full_ratio) + "\% of the text.")
	return df, df2