Code example #1
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize


class NERparser():
    def __init__(self):
        self.st = StanfordNERTagger('/home/joe32140/stanford/stanford-ner-2018-02-27/classifiers/english.all.3class.distsim.crf.ser.gz',
                               '/home/joe32140/stanford/stanford-ner-2018-02-27/stanford-ner.jar',
                                                   encoding='utf-8')

    def getNER_sents(self, sents):
        tokenized_sents = [word_tokenize(sent) for sent in sents]
        classified_sents = self.st.tag_sents(tokenized_sents)
        return classified_sents

    def count_entity(self, entity, table):

        if entity[0] not in table[entity[1]]:
            table[entity[1]][entity[0]]=str(len(table[entity[1]].keys()))
        return table[entity[1]][entity[0]]


    def replace(self, sents):
        classified_sents =self.getNER_sents(sents)
        new_sentences=[]
        for i, sent in enumerate(classified_sents):
            if i%5==0:
                check_repeat={'PERSON':{}, 'LOCATION':{}, 'ORGANIZATION':{}}
            tmp=[]
            for w in sent:
                if w[1]!='O':
                    count = self.count_entity(w, check_repeat)
                    tmp.append(w[1]+'_'+str(count))
                else:
                    tmp.append(w[0])
            new_sentences.append(' '.join(tmp))
        return new_sentences
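A minimal usage sketch of the class above, assuming the classifier and jar paths in __init__ are adjusted to a local Stanford NER install; the sentences and the tags shown in the comment are illustrative only.

parser = NERparser()
sents = ["Barack Obama visited Microsoft in Seattle.",
         "He met Bill Gates there."]
print(parser.replace(sents))
# e.g. ['PERSON_0 PERSON_1 visited ORGANIZATION_0 in LOCATION_0 .',
#       'He met PERSON_2 PERSON_3 there .']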
Code example #2
def extract_ne(sents):
    from nltk.tag import StanfordNERTagger
    import nltk
    
    st = StanfordNERTagger('ner/english.all.3class.distsim.crf.ser.gz', 'ner/stanford-ner.jar')
    
    
    
    sents_tk = []
    for sent in sents:
        sent_tk = nltk.word_tokenize(sent)
        sents_tk.append(sent_tk)
        
    
    ne = st.tag_sents(sents_tk)
    
    res = []    
    for sent in ne:
        last_tag = "O"
        en = ""
        sent.append(("", "O"))        
        
        for (word, tag) in sent:
            if tag == 'O':
                if en != "": res.append(en); en = ""            
            elif last_tag == tag:
                en += " " + word
            else:
                if en != "": res.append(en); en = ""
                en = word
            
            last_tag = tag
                
    return (ne, res)
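A quick sketch of calling extract_ne, assuming the 'ner/' model and jar paths inside the function point at a local Stanford NER download; the input sentences and the printed entities are invented examples.

sentences = ["Angela Merkel spoke in Berlin.",
             "The European Union issued a statement."]
tagged, entities = extract_ne(sentences)
print(entities)   # e.g. ['Angela Merkel', 'Berlin', 'European Union']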
        
Code example #3
    def nonlocal_ner_tag_tokens(self):
        home = expanduser("~")
        os.environ['CLASSPATH'] = home + '/stanford-ner-2015-12-09'
        os.environ[
            'STANFORD_MODELS'] = home + '/stanford-ner-2015-12-09/classifiers'

        st = StanfordNERTagger("english.all.3class.distsim.crf.ser.gz",
                               java_options='-mx4000m')

        stanford_dir = st._stanford_jar[0].rpartition('/')[0]
        stanford_jars = find_jars_within_path(stanford_dir)

        st._stanford_jar = ':'.join(stanford_jars)

        # do not tokenise text
        nltk.internals.config_java(
            options=
            '-tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions "tokenizeNLs=true"'
        )

        self.nonlocal_ner_doc_tokens = []
        temp_nonlocal_bulk_process = []
        length_of_docs = [len(doc) for doc in self.tokenized_docs_by_lines]
        for doc_idx, doc in enumerate(self.tokenized_docs_by_lines):
            for line_idx, line in enumerate(doc):
                temp_nonlocal_bulk_process.append(line)

        temp_nonlocal_bulk_process = st.tag_sents(temp_nonlocal_bulk_process)

        current_idx = 0
        for doc_len_idx, doc_len in enumerate(length_of_docs):
            self.nonlocal_ner_doc_tokens.append(
                temp_nonlocal_bulk_process[current_idx:current_idx + doc_len])
            current_idx += doc_len
        print("NER nonlocal tagged tokens")
Code example #4
    def get_named_entities_sents(self, sents):
        dir_path = os.path.dirname(os.path.realpath(__file__))
        #print("ner: current working directory is ", dir_path)
        ner_tagger_path = dir_path + r"/resources/stanford-ner.jar"
        german_model = dir_path + r"/resources/german.conll.hgc_175m_600.crf.ser.gz"
        #print(ner_tagger_path)
        tagger = StanfordNERTagger(german_model,
                                   ner_tagger_path,
                                   encoding="UTF-8")  # iso-8859-15
        tagger.java_options = '-Xmx2048m -Xms2048m'
        nltk.internals.config_java(options='-Xmx2g')

        print("Running named entity recognition on sentences")
        t0 = time()

        self.named_entities = tagger.tag_sents(sents)
        print(len(self.named_entities), "sentences tagged for named entities")

        print("done in %0.3fs" % (time() - t0))
        return self.sort_named_entities()
Code example #5
    def create_video_roles_timeline(self, subtitle_path):
        if subtitle_path is None:
            raise SubtitleNotFound(
                f"Could not find video's subtitle in path: {subtitle_path}")
        subs = pysrt.open(subtitle_path)
        subs_entities_timeline_dict = {}

        re_brackets_split = re.compile(r"(\[.*?\]|.*?:|^\(.*?\)$)")
        # (\[(.* ?)\] | (.* ?)\: | ^ \((.* ?)\)$)
        cc = RemoveControlChars()
        subs_clean = [
            cc.remove_control_chars(s.text.strip('-\\\/').replace("\n", " "))
            for s in subs
        ]
        subs_clean = [re.sub(r'<[^<]+?>', '', s) for s in subs_clean]
        brackets = [re_brackets_split.findall(s) for s in subs_clean]
        subs_text = [word_tokenize(s) for s in subs_clean]
        st = StanfordNERTagger(STANFORD_NLP_MODEL,
                               encoding='utf-8',
                               path_to_jar=STANFORD_NLP_JAR)

        nlp = spacy.load('en_core_web_sm',
                         disable=['parser', 'tagger', 'textcat'])
        entities_spacy = [[(ent.text, ent.label_) for ent in nlp(s).ents]
                          for s in subs_clean]

        entities_nltk = st.tag_sents(subs_text)

        for s, e_n, e_s, b in zip(subs, entities_nltk, entities_spacy,
                                  brackets):
            roles = self._video_role_analyzer.find_roles_names_in_text_ner(
                e_n, e_s)
            for item in b:
                roles.update(
                    self._video_role_analyzer.find_roles_names_in_text(item))
            # role_counter.update(roles)
            if len(roles) > 0:
                t = s.start.seconds + s.start.minutes * 60
                subs_entities_timeline_dict[t] = roles
        logging.debug(str(subs_entities_timeline_dict))
        return subs_entities_timeline_dict
Code example #6
class NERTagger():
    def __init__(self):
        stanford_ner_dir = '/Users/Rena/StandfordParserData/stanford-ner-2018-02-27/'
        eng_model_filename = stanford_ner_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz'
        my_path_to_jar = stanford_ner_dir + 'stanford-ner.jar'

        self.tagger = StanfordNERTagger(model_filename=eng_model_filename,
                                        path_to_jar=my_path_to_jar)
        self.ner_cache = {}
        self.time_list = [
            'january', 'february', 'march', 'april', 'may', 'june', 'july',
            'august', 'september', 'october', 'november', 'december'
        ]
        self.ordinal_list = [
            'first', 'largest', 'highest', 'second', 'third', 'fourth',
            'fifth', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
            'eight', 'nine', 'ten'
        ]

    def cache_sents(self, sents):
        ###cache the documents###
        # materialise the tokenised sentences (a bare map object has no len() in Python 3)
        tokenised_sents = [sent.split() for sent in sents]
        tagged = self.tagger.tag_sents(tokenised_sents)
        for i in range(len(tokenised_sents)):
            self.ner_cache[sents[i]] = tagged[i]
        return True

    def tag(self, sents):

        pattern = '([^A-Z]\.\s[A-Z])'
        if re.search(pattern, sents):
            sentences = self.split_para(sents)
            entity_list = []
            for s in sentences:

                try:
                    tagged = self.ner_cache[s]
                except KeyError:
                    sent = map(self.strip_word, s.split())
                    tagged = self.tagger.tag(sent)
                    self.ner_cache[s] = tagged
                entity = self.entity_parse(tagged)
                entity_list.append(entity)
            result = sum(entity_list, [])
        else:

            try:
                tagged_sents = self.ner_cache[sents]
            except KeyError:
                sen = map(self.strip_word, sents.split())
                tagged_sents = self.tagger.tag(sen)
                self.ner_cache[sents] = tagged_sents

            result = self.entity_parse(tagged_sents)
        return result

    def strip_word(self, word):
        pattern = ('"",:.?!;' '')
        return word.strip(pattern)

    def split_para(self, para):
        pattern = '([^A-Z]\.\s[A-Z])'
        splitted = re.split(pattern, para)
        matching_pattern = '^[^A-Z]\.\s[A-Z]$'

        for i in range(len(splitted)):
            if re.match(matching_pattern, splitted[i]):
                symbols = splitted[i].split()
                try:
                    splitted[i - 1] += symbols[0]
                    splitted[i + 1] = symbols[1] + splitted[i + 1]
                except:
                    continue

        proper_splitted = []
        for i in range(len(splitted)):
            if re.match(matching_pattern, splitted[i]):
                continue
            else:
                proper_splitted.append(splitted[i])

        return proper_splitted

    def entity_parse_detail(self, tagged_sent):
        ###entity parsing method for detailed tagset###
        start = True
        retagged_entity = []
        for item in tagged_sent:
            token, tag = item
            if token.lower() in self.time_list:
                tag = 'MONTH'
            elif re.match(r'^[12][0-9]{3}$', token):
                tag = 'YEAR'
            elif token.lower() in self.ordinal_list:
                tag = 'NUMBER'
            elif tag == 'ORGANIZATION':
                tag = 'OTHER'
            elif tag == 'O':
                if not start and len(token) > 0 and token[0].isupper():
                    tag = 'OTHER'
                elif any(char == '%' for char in token):
                    tag = 'NUMBER'  #'PERCENT'
                elif any(char == '$' for char in token):
                    tag = 'NUMBER'  #'MONEY'
                elif any(char.isdigit() for char in token):
                    tag = 'NUMBER'

            if start:
                start = False
            retagged_entity.append((token, tag))

        retagged_entity = self.retag_date(retagged_entity)

        return self.gather_entity(retagged_entity)

    def retag_date(self, tagged_entity):
        ###gather NUMBER MONTH YEAR pattern into DATE###
        result_entity = []
        i = 0
        while i < len(tagged_entity) - 2:
            (token1, tag1) = tagged_entity[i]

            if tag1 == 'NUMBER':
                (token2, tag2) = tagged_entity[i + 1]
                if tag2 == 'MONTH':
                    (token3, tag3) = tagged_entity[i + 2]
                    if tag3 == 'YEAR':
                        result_entity.append((token1, 'DATE'))
                        result_entity.append((token2, 'DATE'))
                        result_entity.append((token3, 'DATE'))
                        i = i + 3
                        continue
            elif tag1 == 'MONTH':
                (token2, tag2) = tagged_entity[i + 1]
                if tag2 == 'NUMBER':
                    (token3, tag3) = tagged_entity[i + 2]
                    if tag3 == 'YEAR':
                        result_entity.append((token1, 'DATE'))
                        result_entity.append((token2, 'DATE'))
                        result_entity.append((token3, 'DATE'))
                        i = i + 3
                        continue
            result_entity.append((token1, tag1))
            i += 1
        for counter in range(len(tagged_entity) - i):
            result_entity.append(tagged_entity[i + counter])

        return result_entity

    def entity_parse(self, tagged_sent):
        ###entity parsing for general tagset###
        start = True
        retagged_entity = []
        for item in tagged_sent:
            token, tag = item
            if token.lower() in self.time_list:
                tag = 'NUMBER'
            if token.lower() in self.ordinal_list:
                tag = 'NUMBER'
            if tag == 'ORGANIZATION':
                tag = 'OTHER'
            if tag == 'O':
                if not start and len(token) > 0 and token[0].isupper():
                    tag = 'OTHER'
                elif any(char.isdigit() for char in token):
                    tag = 'NUMBER'

            if start:
                start = False
            retagged_entity.append((token, tag))

        return self.gather_entity(retagged_entity)

    def gather_entity(self, retagged_entity):
        ###gather continuous entities###
        gathered_entity = []
        tag = 'O'
        token = ''
        for (new_token, new_tag) in retagged_entity:
            if tag == new_tag:

                token = token + ' ' + new_token

            else:
                if tag != 'O':

                    gathered_entity.append((token, tag))
                tag = new_tag
                token = new_token
        if tag != 'O':

            gathered_entity.append((token, tag))

        return gathered_entity
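The grouping done by gather_entity can be checked without running the Stanford tagger at all: the tagged tuples below are hand-made, and __new__ is used only to skip the jar-loading constructor.

tagger = NERTagger.__new__(NERTagger)   # bypass __init__: no Stanford jars needed here
tagged = [('Barack', 'PERSON'), ('Obama', 'PERSON'), ('visited', 'O'),
          ('Stanford', 'ORGANIZATION'), ('University', 'ORGANIZATION')]
print(tagger.gather_entity(tagged))
# [('Barack Obama', 'PERSON'), ('Stanford University', 'ORGANIZATION')]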
Code example #7
File: novel.py  Project: IDSIA/novel2graph
class Novel:

    def __init__(self, txt_file):
        CLASSIFIER = 'english.muc.7class.distsim.crf.ser.gz'
        root = os.path.join(os.getcwd(), '..', 'libraries', 'stanford-ner-2018-10-16')
        ner_jar_file = os.path.join(root, 'stanford-ner.jar')
        ner_classifier = os.path.join(root, 'classifiers/' + CLASSIFIER)
        self.tagger = StanfordNERTagger(ner_classifier, ner_jar_file, encoding='utf-8')
        np.set_printoptions(threshold=sys.maxsize)
        logging.getLogger().setLevel(logging.INFO)
        STOP = stopwords.words('english') + list(string.punctuation)
        self.file = txt_file
        self.text = ''
        self.persons = []
        self.sentences = []
        self.aliases = []

    def read(self, path=''):
        if os.path.isfile(path + self.file):
            for encode in ENCODING:
                try:
                    file = open(path + self.file, 'r', encoding=encode)
                    text = file.read()
                    self.original_text = text
                    # normalise whitespace: collapse runs (including newlines) to single spaces
                    text = re.sub(r'\s+', ' ', text).strip()
                    self.text = text
                    self.sentences = sent_tokenize(text)
                    break
                except IOError:
                    logging.error('\t Cannot open ' + self.file)
                    exit(-1)
                except UnicodeDecodeError:
                    logging.warning('\t Cannot open file using encoding ' + encode + ' trying a new encoding!')

    def custom_coref_resolved(self, doc):
        ''' Use this method instead of doc._.coref_resolved, here we clean the character's name before to replace it.
        That because sometimes the coref method identifies commas, quotes,... as part of the name'''

        clusters = doc._.coref_clusters
        resolved = list(tok.text_with_ws for tok in doc)
        for cluster in clusters:
            for coref in cluster:
                if coref != cluster.main:
                    new_name = cluster.main.text.translate(str.maketrans('', '', string.punctuation)).strip()
                    resolved[coref.start] = new_name + doc[coref.end - 1].whitespace_
                    for i in range(coref.start + 1, coref.end):
                        resolved[i] = ""
        return ''.join(resolved)

    def coreference(self):
        nlp = spacy.load("en_core_web_sm")
        coref = neuralcoref.NeuralCoref(nlp.vocab)
        nlp.add_pipe(coref, name='neuralcoref')
        words = self.dealiased_text.split(' ')
        words_number = len(words)
        badge_size = 100000
        if words_number > badge_size:
            if words_number % badge_size == 0:
                iterations = int(words_number / badge_size)
            else:
                iterations = int(words_number / badge_size)
                iterations += 1

            new_text = ""
            for i in range(0, iterations):
                logging.info('Coreferencing part ' + str(i + 1) + ' of ' + str(iterations))
                from_index = i * badge_size
                to_index = (i+1) * badge_size
                sub_text = ' '.join(words[from_index:to_index])

                text_coreference = nlp(sub_text)
                # text = text_coreference._.coref_resolved
                new_text += self.custom_coref_resolved(text_coreference)
        else:
            new_text = self.dealiased_text

        self.dealiased_text = new_text

    def create_cluster_repetitions_df(self):
        self.cluster_repetitions_df = pd.DataFrame(
            data=[['CCHARACTER' + str(key), val[0], val[1]] for key, val in self.cluster_repetitions.items()],
            columns=['Alias', 'Names', 'Occurrences'])

    def parse_persons(self):
        people = {}
        name = ""
        # contains_punctuations = False
        tokenized_sentences = [wtk(sentence) for sentence in self.sentences]
        tagged_sentences = self.tagger.tag_sents(tokenized_sentences)
        for sentence in tagged_sentences:
            for word, tag in sentence:
                # a name is made of 1 or more names, read all
                if tag == 'PERSON':
                    if len(word) == 1:
                        # print(word)
                        continue
                    # all strange symbols: '!"”“#$%"&\'’()*+,./:;<=>?@[]^_`{|}~ʹ'
                    # if word start or end with special characters, drop them
                    # if word[0] in '!"”"“#$%"&\'’()*+,/:;<=>?@[]^_`{|}~ʹ':
                    #     word = word[1:]
                    #     contains_punctuations = True
                    # if word[-1] in '!"”"“#$%"&\'’()*+,/:;<=>?@[]^_`{|}~ʹ':
                    #     word = word[:-1]
                    #     contains_punctuations = True
                    if name == "":
                        name += word
                    else:
                        name += " " + word
                else:
                    # name is not empty
                    if name:
                        name = name.strip()
                        current_name = name.split(" ")
                        # if len(current_name) >= 2 and contains_punctuations:
                        #     print(name)
                        # Usually and/ed/or are identified as name, e.g. Tom and Jerry
                        if len(current_name) == 3 and (current_name[1] == 'and' or current_name[1] == 'to' or \
                                                       current_name[1][-2:] == 'ed' or current_name[1] == 'or' or \
                                                       current_name[1] == 'nor'):
                            people[current_name[0]] = people.get(current_name[0], 0) + 1
                            people[current_name[2]] = people.get(current_name[2], 0) + 1
                        # Usually 2 words name contains adverbs or adjectives (...ly) verb (...ed), remove them
                        elif len(current_name) == 2 and ((current_name[1] in string.punctuation) or \
                                                         (current_name[1][-2:] == 'ed') or \
                                                         (current_name[1][-2:]) == 'ly' or \
                                                         (current_name[1].lower() in CONJUNCTIONS)):
                            people[current_name[0]] = people.get(current_name[0], 0) + 1
                        elif len(current_name) == 1 and current_name[0] in FALSE_POSITIVES:
                            name = ""
                        else:
                            people[name] = people.get(name, 0) + 1
                        name = ""
                        # contains_punctuations = False

        self.persons = collections.OrderedDict(sorted(people.items()))
        return

    def cluster_aliases(self):
        complete_alphabet_names = collections.defaultdict(list)
        simplified_alphabet_names = collections.defaultdict(list)
        for name in self.persons:
            split_name = name.lower().split()
            new_name = ""
            if len(split_name) == 1:  # single names do not have pre-names
                new_name = split_name[0]
            else:
                for name_part in split_name:
                    is_prename = False
                    for pre_name in PRE_NAMES:
                        if name_part == pre_name:
                            is_prename = True
                    if not is_prename:
                        new_name += " " + name_part
                new_name = new_name.strip()
            if len(new_name) == 0:
                new_name = name
            complete_alphabet_names[new_name[0].upper()].append(name)
            simplified_alphabet_names[new_name[0].upper()].append(new_name)

        clusters_number = 0
        db_names = defaultdict(list)
        db_simplified_names = defaultdict(list)
        for letter, names in simplified_alphabet_names.items():
            n_persons = len(names)
            similarities = np.empty((n_persons, n_persons))
            if len(names) == 1:
                db_names[clusters_number].append(complete_alphabet_names[letter][0])
                db_simplified_names[clusters_number].append(simplified_alphabet_names[letter][0])
                clusters_number += 1
                continue

            for i, person1 in enumerate(names):
                for j, person2 in enumerate(names):
                    # differ = difflib.SequenceMatcher(None, person1, person2) similarities[i][j] = differ.ratio()
                    # similarities[i][j] = fuzz.ratio(person1, person2)/100.
                    # similarities[i][j] = fuzz.token_sort_ratio(person1, person2) / 100.
                    # similarities[i][j] = fuzz.token_set_ratio(person1, person2) / 100.

                    # take the shortest word and find the
                    # similarity between this name and each subslice of the longer name (with the same length). It
                    # returns the higher value.
                    similarities[i][j] = fuzz.partial_ratio(person1, person2) / 100.

            # eps = find_best_eps(similarities)
            # print(letter, ': ', eps)
            eps = 0.3
            db = DBSCAN(metric='precomputed', min_samples=1, algorithm='brute', eps=eps).fit(1 - similarities)

            labels = db.labels_
            if -1 in labels:
                logging.info('Some names are not clustered')
            for i, name in enumerate(complete_alphabet_names[letter]):
                db_names[labels[i] + clusters_number].append(name)
                simplified_name = simplified_alphabet_names[letter][i]
                db_simplified_names[labels[i] + clusters_number].append(simplified_name)

            unique = np.unique(labels, return_counts=False)
            clusters_number += len(unique)

        cluster_rep = {}
        simple_cluster_rep = {}
        for id, some_names in db_names.items():
            repetitions = []
            for name in some_names:
                repetitions.append(self.persons[name])
            cluster_rep[id] = (some_names, repetitions)
            simple_cluster_rep[id] = (db_simplified_names[id], repetitions)

        # Debug here to discover which names are correctly clustered
        self.cluster_repetitions = cluster_rep
        self.simple_cluster_repetitions = simple_cluster_rep

    def find_persons_title(self):
        text = self.text.replace('\n', ' ')
        new_names = {}
        for name, occurrence in self.persons.items():
            pre_names = re.findall(r'([^ \r\n]+)( ' + name + ')([\r\n]| |$|.)', text, re.IGNORECASE)
            if len(pre_names) == 0:
                continue
            pre_names_occurrences = collections.defaultdict(int)
            for pre_name in pre_names:
                # skip prename which end with punctuations, it is not in the same phrase as the subject
                if pre_name[0][-1] in '!"”“#$%"&\'’()*+,./:;<=>?@[]^_`{|}~ʹ':
                    continue
                pre_names_occurrences[pre_name[0]] += 1
            if len(pre_names_occurrences) == 0:
                continue
            max_index = np.argmax(list(pre_names_occurrences.values()))
            max_occurrence = list(pre_names_occurrences.values())[max_index]
            new_prename = list(pre_names_occurrences.keys())[max_index]
            if float(max_occurrence) / float(occurrence) > 0.5 and max_occurrence > 1:
                # skip special starting character in the pre-name
                if new_prename[0] in '!"”"“#$%"&\'’()*+,./:;<=>?@[]^_`{|}~ʹ':
                    new_prename = new_prename[1:] + ' ' + name

                if new_prename.lower() not in CONJUNCTIONS:
                    new_name = new_prename + ' ' + name
                    logging.info('Adding new name: %s', new_name)
                    new_names[new_name] = max_occurrence
                    PRE_NAMES.add(new_prename.lower())
                    logging.info('Adding new pre-name: %s', new_prename.lower())

        persons = self.persons
        for new_name, occurrence in new_names.items():
            if new_name not in persons:
                persons[new_name] = occurrence
            else:
                persons[new_name] += occurrence

        self.persons = collections.OrderedDict(sorted(persons.items()))

    def filter_similar_names(self, similarity):
        # Winsley is contained in many cluster, insert it into the cluster with more repetitions
        old_similarity = similarity
        for key_a, value_a in old_similarity.items():
            if len(value_a) > 1:
                id_best = -1
                best = -1
                for id in value_a:
                    repetitions = self.cluster_repetitions[id][1]
                    sum_repetitions = sum(repetitions)
                    if sum_repetitions > best:
                        best = sum_repetitions
                        id_best = id
                similarity[key_a] = [id_best]

        # add similar names to a cluster
        new_cluster = self.cluster_repetitions
        new_simple_cluster = self.simple_cluster_repetitions
        to_remove = set()
        to_delete_at_end = set()
        for key_a, value_a in similarity.items():
            if key_a in to_remove:
                continue

            # find other names with the same preference
            same_preferences = set()
            for key_b, value_b in similarity.items():
                if value_b[0] == value_a[0]:
                    same_preferences.add(key_b)

            # more key with the same preference
            selected_cluster = -1
            if len(same_preferences) > 1:
                # take the max
                max = -1
                best_key = -1
                for preference in same_preferences:
                    occurrences = sum(self.cluster_repetitions[preference][1]) + sum(
                        self.cluster_repetitions[similarity[preference][0]][1])
                    if occurrences > max:
                        max = occurrences
                        best_key = preference

                for preference in same_preferences:
                    if preference != key_a:
                        to_remove.add(preference)
                selected_cluster = best_key
            else:
                selected_cluster = list(same_preferences)[0]

            # check if the value of the best is also a key
            value = similarity[selected_cluster][0]
            if value in similarity:
                # the similarity is symmetric? A wants B and B wants A?
                if similarity[value] != selected_cluster:
                    # take the max and remove the other, a=AB and b=BC
                    occurrences_a = sum(self.cluster_repetitions[selected_cluster][1]) + sum(
                        self.cluster_repetitions[value][1])
                    occurrences_b = sum(self.cluster_repetitions[value][1]) + sum(
                        self.cluster_repetitions[similarity[value][0]][1])

                    non_selected_cluster = selected_cluster if np.argmin([occurrences_a, occurrences_b]) == 0 else value
                    selected_cluster = selected_cluster if np.argmax([occurrences_a, occurrences_b]) == 0 else value
                    to_remove.add(selected_cluster)
                    to_remove.add(non_selected_cluster)
                else:
                    to_remove.add(selected_cluster)
                    to_remove.add(value)
            else:
                to_remove.add(selected_cluster)

            # Update both the list with original names and the one with simplified names
            add_user = new_cluster[selected_cluster][0]
            add_repetition = new_cluster[selected_cluster][1]
            add_simple_user = new_simple_cluster[selected_cluster][0]

            cluster_repetitions = new_cluster[similarity[selected_cluster][0]]
            cluster_repetitions[0].extend(add_user)
            # next operation will update both the original and the simple names list
            cluster_repetitions[1].extend(add_repetition)
            cluster_repetitions = new_simple_cluster[similarity[selected_cluster][0]]
            cluster_repetitions[0].extend(add_simple_user)
            to_delete_at_end.add(selected_cluster)

        return to_delete_at_end, new_cluster, new_simple_cluster

    def associate_simple_single_names(self):
        single_names = []
        single_ids = []
        multiple_names = []
        multiple_ids = []
        # find clusters composed by only 1 name and clusters with more names
        for id, names_repetitions in self.simple_cluster_repetitions.items():
            names = names_repetitions[0]
            if len(names) == 1 or all(name == names[0] for name in names):
                single_names.append(names_repetitions)
                single_ids.append(id)
            else:
                multiple_names.append(names_repetitions)
                multiple_ids.append(id)

        # compute the similarity between the single names and all other clusters (also other single names)
        similarity = {}
        for key_a, single_name_repetitions in zip(single_ids, single_names):
            single_name = single_name_repetitions[0][0]
            # single_repetition = single_name_repetitions[1][0]

            for id, names_repetitions in self.simple_cluster_repetitions.items():
                if key_a != id:
                    names = names_repetitions[0]
                    for name in names:
                        if single_name in name or name in single_name:
                            # print(single_name, ' - ', names)
                            if key_a not in similarity:
                                similarity[key_a] = []

                            similarity[key_a].append(id)
                            break

        to_delete_at_end, new_cluster, new_simple_cluster = self.filter_similar_names(similarity)

        fix_indexes_cluster, fix_simple_indexes_cluster = self.delete_names_bottom_up(to_delete_at_end, new_cluster,
                                                                                      new_simple_cluster)
        self.cluster_repetitions = fix_indexes_cluster
        self.simple_cluster_repetitions = fix_simple_indexes_cluster

    def delete_names_bottom_up(self, to_delete_at_end, new_cluster, new_simple_cluster):
        # delete bottom up, to eliminate problem with indexes
        to_delete_at_end = sorted(list(to_delete_at_end), key=lambda x: x, reverse=True)
        for key_a in to_delete_at_end:
            del new_cluster[key_a]
            del new_simple_cluster[key_a]

        fix_indexes_cluster = {}
        fix_simple_indexes_cluster = {}
        i = 0
        for cluster_idx, values in new_cluster.items():
            fix_indexes_cluster[i] = values
            fix_simple_indexes_cluster[i] = new_simple_cluster[cluster_idx]
            i += 1

        return fix_indexes_cluster, fix_simple_indexes_cluster

    def associate_single_names(self):
        similarity = {}
        for id1, value1 in self.cluster_repetitions.items():
            if len(value1[0]) == 1:
                for id2, value2 in self.cluster_repetitions.items():
                    if id1 != id2:
                        single_name = value1[0][0]
                        if any(single_name in name for name in value2[0]):
                            if id1 not in similarity:
                                similarity[id1] = []

                            similarity[id1].append(id2)

        to_delete_at_end, new_cluster, new_simple_cluster = self.filter_similar_names(similarity)
        fix_indexes_cluster, fix_simple_indexes_cluster = self.delete_names_bottom_up(to_delete_at_end, new_cluster,
                                                                                      new_simple_cluster)
        self.cluster_repetitions = fix_indexes_cluster
        self.simple_cluster_repetitions = fix_simple_indexes_cluster

    def dealiases(self):
        replacements = {}
        for id, names_rep in self.cluster_repetitions.items():
            character = 'CCHARACTER' + str(id)
            names = names_rep[0]
            for name in names:
                replacements[name] = character

        ordered_replacements = {}
        for k in sorted(replacements, key=len, reverse=True):
            ordered_replacements[k] = replacements[k]

        self.dealiased_text = replace_words(self.text, ordered_replacements)
        return

    def store(self, filename, data, type='csv'):
        if type == 'csv':
            try:
                with open(filename, 'w', newline='', encoding="utf-8") as csvfile:
                    writer = csv.writer(csvfile)
                    for key, value in data.items():
                        writer.writerow([key, value])
            except IOError:
                logging.info("I/O error")
        else:
            with open(filename, "w", encoding="utf-8") as f:
                f.write(data)

    def remove_less_than(self, occurrences):
        new_persons = {}
        for name, occurrence in self.persons.items():
            if occurrence <= occurrences:
                continue
            else:
                new_persons[name] = occurrence

        self.persons = new_persons
        return
Code example #8
from nltk.tag import StanfordNERTagger
import pandas as pd
from sklearn.metrics import f1_score, confusion_matrix

from loader import Load

train, test = Load('c')

ner = StanfordNERTagger('./stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz', './stanford-ner-2018-10-16/stanford-ner.jar')

data = train

data['tweet'] = ner.tag_sents(data['tweet'].str.split(' '))


pred = []

for i, d in data.iterrows():
    tweet = d['tweet']
    tag = 'IND'
    for w in tweet:
        if w[1] == 'ORGANIZATION':
            tag = 'GRP'
        # elif w[1] == 'PEOPLE':
        #     tag = 'IND'

    pred.append(tag)

print(confusion_matrix(data['label'], pred))
print(f1_score(data['label'], pred, average='macro'))
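A hand-made illustration of the labelling rule used above, a tweet is predicted 'GRP' if any token is tagged ORGANIZATION and 'IND' otherwise; the tags below are invented, not output of the real tagger or the Load('c') data.

demo_tweets = [[('Google', 'ORGANIZATION'), ('announced', 'O')],
               [('I', 'O'), ('love', 'O'), ('pizza', 'O')]]
demo_pred = ['GRP' if any(tag == 'ORGANIZATION' for _, tag in tweet) else 'IND'
             for tweet in demo_tweets]
print(demo_pred)   # ['GRP', 'IND']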
Code example #9
class NER:

    # Any one-time initialization code can go here.  The entire nested question-and-answer
    # dataset is passed as a parameter, in case the initialization requires any of that data.

    def init(self, allQuestions):
        os.environ[
            "STANFORD_MODELS"] = "./Features/stanford-ner-2014-06-16/classifiers/"
        os.environ[
            "CLASSPATH"] = "./Features/stanford-ner-2014-06-16/stanford-ner.jar"

        self.nerMachine = StanfordNERTagger(
            'english.all.3class.distsim.crf.ser.gz')

        sentences = []
        ids = []
        for q in allQuestions:
            ids.append("Q" + allQuestions[q]['id'])
            sentences.append(allQuestions[q]['question_words'])
            for r in allQuestions[q]['related']:
                ids.append("R" + allQuestions[q]['related'][r]['id'])
                sentences.append(
                    allQuestions[q]['related'][r]['question_words'])

        tagged = self.nerMachine.tag_sents(sentences)

        for i in range(0, len(ids)):
            id = ids[i]
            if id[0] == 'Q':
                qid = id[1:]
                allQuestions[qid]['ner'] = tagged[i]
            else:
                rid = id[1:]
                allQuestions[qid]['related'][rid]['ner'] = tagged[i]

        return

    # Given a specific question, return a feature vector (one-dimensional array of one
    # or more features).
    def createFeatureVector(self, question, parentQuestion):
        # This is just placeholder code - insert code that actually generates a feature vector here
        # for the given question, and then return that feature vector instead of [0].
        #question['ner']=self.nerMachine.tag(question['question_words'])
        #parentQuestion['ner'] = self.nerMachine.tag(parentQuestion['question_words'])
        #pprint(question['question_words'])
        #pprint(question['ner'])
        # print(question['ner'])
        # print(parentQuestion['ner'])
        qNer = []
        pNer = []
        for i in question['ner']:
            for j in parentQuestion['ner']:
                if not i[1] == 'O':
                    qNer.append(str.lower(i[0]))
                if not j[1] == 'O':
                    pNer.append(str.lower(j[0]))
        feature = len(list(set(qNer).intersection(pNer)))

        # for i in parentQuestion['ner']:
        #     if not i[1]=='O':
        #         pNer.append(str.lower(i[0]))
        # print(qNer)
        # print(pNer)
        # feature=len(list(set(qNer).intersection(pNer)))
        # print(feature)
        #Calculate named entity overlap
        return [feature]

    # Returns a list of names for the features generated by this module.  Each entry in the
    # list should correspond to a feature in the createFeatureVector() response.

    def getFeatureNames(self):
        return ['NER']
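The NER-overlap feature can be exercised with hand-tagged input; the tuples below are made up, and the class is instantiated directly because it defines init() rather than __init__(), so nothing is loaded.

q = {'ner': [('Paris', 'LOCATION'), ('is', 'O'), ('nice', 'O')]}
p = {'ner': [('Visit', 'O'), ('Paris', 'LOCATION'), ('soon', 'O')]}
print(NER().createFeatureVector(q, p))   # [1] -> one shared named-entity token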
Code example #10
class NLPCore:
    """
    nlp processing including Stanford Word Segmenter, Stanford POS Tagger, 
    Stanford Named Entity Recognizer and Stanford Parser 
    """
    def __init__(self):
        self.root_path = '../Models/stanfordNLP/'

        # word segmenter
        self.segmenter = StanfordSegmenter(
            path_to_jar=self.root_path + "stanford-segmenter.jar",
            path_to_slf4j=self.root_path + "log4j-over-slf4j.jar",
            path_to_sihan_corpora_dict=self.root_path + "segmenter/",
            path_to_model=self.root_path + "segmenter/pku.gz",
            path_to_dict=self.root_path + "segmenter/dict-chris6.ser.gz")

        # pos tagger
        self.posTagger = StanfordPOSTagger(
            self.root_path + 'pos-tagger/chinese-distsim.tagger',
            path_to_jar=self.root_path + "stanford-postagger.jar")

        # named entity recognizer
        self.nerTagger = StanfordNERTagger(
            self.root_path + 'ner/chinese.misc.distsim.crf.ser.gz',
            path_to_jar=self.root_path + 'stanford-ner.jar')

        self.parser = StanfordDependencyParser(
            model_path=self.root_path + 'lexparser/chinesePCFG.ser.gz',
            path_to_jar=self.root_path + 'stanford-parser.jar',
            path_to_models_jar=self.root_path +
            'stanford-parser-3.7.0-models.jar',
            encoding='gbk')

    def split_sent_stanford(self, textPair):
        """
        Stanford Word Segmenter, input should be raw text
        :return: also TextPair with raw string of results
        """
        t1 = self.segmenter.segment(textPair.t1)
        t2 = self.segmenter.segment(textPair.t2)

        if DEBUG:
            print(t1, t2)

        return text_pair.TextPair(t1, t2, textPair.label)

    def split_sents_stanford(self, textPairs):
        """
        Stanford Word Segmenter, input should be list of sents
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1 for textPair in textPairs]
        sents2 = [textPair.t2 for textPair in textPairs]

        split1 = self.segmenter.segment_sents(sents1).split('\n')
        split2 = self.segmenter.segment_sents(sents2).split('\n')

        rlist = []
        for i in range(len(textPairs)):
            rlist.append(
                text_pair.TextPair(split1[i], split2[i], textPairs[i].label))

            if DEBUG:
                print(split1[i], split2[i])

        return rlist

    def split_sent_jieba(self, textPair):

        jieba.setLogLevel('INFO')
        ger1 = jieba.cut(textPair.t1)
        ger2 = jieba.cut(textPair.t2)

        t1 = ' '.join(ger1)
        t2 = ' '.join(ger2)

        return text_pair.TextPair(t1, t2, textPair.label)

    def pos_tag(self, textPair):
        """
        Stanford POS Tagger, input should be splitted
        :return: also TextPair with raw string of results
        """
        t1_s = textPair.t1.split()
        t2_s = textPair.t2.split()

        t1_tag = ' '.join([ele[1] for ele in self.posTagger.tag(t1_s)])
        t2_tag = ' '.join([ele[1] for ele in self.posTagger.tag(t2_s)])

        if DEBUG:
            print(t1_tag, t2_tag)

        return text_pair.TextPair(t1_tag, t2_tag, textPair.label)

    def pos_tag_pairs(self, textPairs):
        """
        Stanford POS Tagger, input should be list of sents
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1.split() for textPair in textPairs]
        sents2 = [textPair.t2.split() for textPair in textPairs]

        tag1 = self.posTagger.tag_sents(sents1)
        tag2 = self.posTagger.tag_sents(sents2)

        rlist = []
        for i in range(len(tag1)):
            t1_tag = ' '.join([ele[1] for ele in tag1[i]])
            t2_tag = ' '.join([ele[1] for ele in tag2[i]])

            rlist.append(text_pair.TextPair(t1_tag, t2_tag,
                                            textPairs[i].label))

            if DEBUG:
                print(t1_tag, t2_tag)

        return rlist

    def ner_tag(self, textPair):
        """
        Stanford Named Entity Recognizer, input should be splitted
        :return: also TextPair with raw string of results
        """
        t1_s = textPair.t1.split()
        t2_s = textPair.t2.split()

        t1_ner = ' '.join(
            [ele[0] + '#' + ele[1] for ele in self.nerTagger.tag(t1_s)])
        t2_ner = ' '.join(
            [ele[0] + '#' + ele[1] for ele in self.nerTagger.tag(t2_s)])

        if DEBUG:
            print(t1_ner, t2_ner)

        return text_pair.TextPair(t1_ner, t2_ner, textPair.label)

    def ner_tag_pairs(self, textPairs):
        """
        Stanford Named Entity Recognizer, input should be list of sents
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1.split() for textPair in textPairs]
        sents2 = [textPair.t2.split() for textPair in textPairs]

        tag1 = self.nerTagger.tag_sents(sents1)
        tag2 = self.nerTagger.tag_sents(sents2)

        rlist = []
        for i in range(len(tag1)):
            t1_ner = ' '.join([ele[0] + '#' + ele[1] for ele in tag1[i]])
            t2_ner = ' '.join([ele[0] + '#' + ele[1] for ele in tag2[i]])

            rlist.append(text_pair.TextPair(t1_ner, t2_ner,
                                            textPairs[i].label))

            if DEBUG:
                print(t1_ner, t2_ner)

        return rlist

    def depen_parse(self, textPair):
        """
        Stanford Dependency Parser, input should be splitted
        :return: also TextPair with raw string of results
        """
        print([p.tree() for p in self.parser.raw_parse(textPair.t1)])
Code example #11
File: texttk.py  Project: baifengbai/texttk
class TextPreprocesser(object):
    def __init__(self, decode_error='strict', strip_accents='unicode', ignore_list=[], lowercase=True, \
         remove_html=True, join_urls=True, use_bigrams=True, use_ner=True, stanford_ner_path="", \
         use_lemmatizer=False, max_df=0.95, min_df=1, max_features=None):
        self.stanford_ner_path = stanford_ner_path  # path to stanford NER
        self.decode_error = decode_error  # options: {‘strict’, ‘ignore’, ‘replace’}
        self.strip_accents = strip_accents  # options: {‘ascii’, ‘unicode’, None}
        self.ignore_list = ignore_list
        self.lowercase = lowercase
        self.remove_html = remove_html
        self.join_urls = join_urls
        self.use_bigrams = use_bigrams
        self.use_ner = use_ner
        self.use_lemmatizer = use_lemmatizer  # use lemmatizer instead of stemmer?
        self.max_df = max_df  # maximum document frequency
        self.min_df = min_df  # remove terms that occur in less than min_df documents
        self.max_features = max_features  # keep only top-N words according to tf across corpus

        self.sentence_splitter = PunktSentenceTokenizer(
        ).tokenize  # Punkt sentence splitter
        self.stemmer = SnowballStemmer("english").stem  # Snowball stemmer
        self.lemmatizer = WordNetLemmatizer().lemmatize  # WordNet lemmatizer
        self.base_tokenizer = CountVectorizer().build_tokenizer(
        )  # sklearn tokenizer works the best, I think...
        self.stop_words = stopwords.words(
            "english")  # nltk list of 128 stopwords
        self.token_pattern = re.compile(
            r'(?u)\b(\w*[a-zA-Z_]\w+|\w+[a-zA-Z_]\w*)\b'
        )  # default value was r'(?u)\b\w\w+\b'
        self.numeric_pattern = re.compile(r'^[0-9]+$')  # number regex
        self.url_pattern = re.compile(r'((http://)?(www\..*?\.\w+).*?)\s')
        self.compound_pattern = re.compile(r'\w+(\-\w+)+')
        if self.use_lemmatizer:
            self.tokenizer = CustomTokenizer(self.base_tokenizer,
                                             self.lemmatizer,
                                             self.token_pattern,
                                             self.numeric_pattern)
        else:
            self.tokenizer = CustomTokenizer(self.base_tokenizer, self.stemmer,
                                             self.token_pattern,
                                             self.numeric_pattern)

    def find_nbest_bigrams(self, corpus, n, metric, min_freq):
        print "finding top-%d bigrams using %s..." % (n, metric)
        alltokens = []
        simplerTokenizer = CustomTokenizer(self.base_tokenizer, lambda x: x,
                                           re.compile(".*"), re.compile("^$"))
        for doc in corpus:
            for token in [t for t in simplerTokenizer(doc)]:
                alltokens.append(token)
        bigram_measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(alltokens)
        finder.apply_freq_filter(
            min_freq)  # bigrams must appear at least 5 times
        if metric.lower() == "pmi":
            best_bigrams = finder.nbest(bigram_measures.pmi,
                                        n)  # doctest: +NORMALIZE_WHITESPACE
        elif metric.lower() == "chi_sq":
            best_bigrams = finder.nbest(bigram_measures.chi_sq,
                                        n)  # doctest: +NORMALIZE_WHITESPACE
        else:
            raise Exception("Unknown metric for bigram finder")
        return best_bigrams

    def remove_punctuation(self, text):
        if not hasattr(self, 'simplerTokenizer'):
            self.simplerTokenizer = CustomTokenizer(self.base_tokenizer,
                                                    lambda x: x,
                                                    self.token_pattern,
                                                    self.numeric_pattern)
        tokens = self.simplerTokenizer(text)
        return ' '.join(tokens)

    def tag_corpus_ner(self, corpus):
        if not hasattr(self, 'stanford_ner'):
            self.stanford_ner = StanfordNERTagger(
                self.stanford_ner_path +
                "classifiers/english.all.3class.distsim.crf.ser.gz",
                self.stanford_ner_path + "stanford-ner.jar")
            self.stanford_ner._stanford_jar = self.stanford_ner_path + "stanford-ner.jar:" + self.stanford_ner_path + "lib/*"

        print "splitting sentences in corpus (for NER)..."
        corpus_sentences = []
        sentence_to_doc_map = {}
        sent_no = 0
        for d in xrange(len(corpus)):
            for sent in self.sentence_splitter(corpus[d]):
                corpus_sentences.append(sent)
                sentence_to_doc_map[sent_no] = d
                sent_no += 1
        tokenized_sentences = []
        for sent in corpus_sentences:
            tokenized_sentences.append(
                [t for t in re.split(r'\s+', sent) if len(t) > 0])
        #tokenized_sentences = [re.split(r'\s+', sent) for sent in corpus_sentences]

        print "tagging sentences with Stanford NER..."
        tagged_sentences = self.stanford_ner.tag_sents(tokenized_sentences)

        # process NER output
        tagged_corpus = []
        current_doc_no = 0
        current_doc = []
        for i in xrange(len(tagged_sentences)):
            doc_no = sentence_to_doc_map[i]
            if doc_no == current_doc_no:
                current_doc += tagged_sentences[i]
            else:
                tagged_corpus.append(current_doc)
                # start the new document with this sentence (it was previously dropped)
                current_doc = list(tagged_sentences[i])
                current_doc_no = doc_no
        tagged_corpus.append(current_doc)

        # get dictionary of named entities per document
        named_entities = []
        for tagged_doc in tagged_corpus:
            tags = {}
            current_ne = []
            for token, tag in tagged_doc:
                if current_ne:
                    if tag == "O" or (tag != "O" and tag != current_ne[-1][1]):
                        tags[' '.join([t for t, _ in current_ne
                                       ])] = current_ne[0][1]
                        current_ne = []
                if tag != "O":
                    current_ne.append((token, tag))
            if current_ne:
                tags[' '.join([t for t, _ in current_ne])] = current_ne[0][1]
            named_entities.append(tags)

        return tagged_corpus, named_entities

    def preprocess_corpus(self, corpus):
        print "preprocessing corpus..."
        print "corpus size:", len(corpus)

        # first pass over the corpus: prepare for NER
        print "first pass over the corpus...\n\tunescape characters"
        if self.remove_html: print "\tremove html"
        if self.strip_accents: print "\tstrip accents"
        if self.join_urls: print "\tjoin URLs"
        print "\tjoin compound words\n\tspace out punctuation"
        for d in xrange(len(corpus)):
            corpus[d] = HTMLParser.HTMLParser().unescape(corpus[d]) + " "
            if self.remove_html:
                corpus[d] = remove_html(corpus[d])
            if self.strip_accents == 'unicode':
                corpus[d] = strip_accents_unicode(corpus[d])
            if self.join_urls:
                corpus[d] = join_urls(corpus[d], self.url_pattern)
            corpus[d] = join_compound_words(corpus[d], self.compound_pattern)
            corpus[d] = space_out_punctuation(corpus[d])

        if self.use_ner:
            tagged_corpus, named_entities = self.tag_corpus_ner(corpus)

            # debug NER
            fw = codecs.open("debug_NER.txt", "w", "utf-8")
            for tags in named_entities:
                fw.write(unicode(tags.items()) + "\n")
            fw.close()

            print "merging named entities as single tokens..."
            for d in xrange(len(corpus)):
                tags = named_entities[d]
                for ne in tags:
                    corpus[d] = corpus[d].replace(ne, re.sub(r'\s+', '', ne))

        # second pass over the corpus: remove punctuation and convert to lowercase
        # (these were useful above for NER, but now can be removed)
        print "second pass over the corpus..."
        print "\tremove punctuation"
        if self.lowercase: print "\tconvert to lowercase"
        simplerTokenizer = CustomTokenizer(self.base_tokenizer, lambda x: x,
                                           self.token_pattern,
                                           self.numeric_pattern)
        for d in xrange(len(corpus)):
            corpus[d] = self.remove_punctuation(corpus[d])
            if self.lowercase:
                corpus[d] = corpus[d].lower()

        if self.use_bigrams:
            # find top N bigrams
            #best_bigrams = self.find_nbest_bigrams(corpus, 100, "pmi", 10)
            best_bigrams = self.find_nbest_bigrams(corpus, 100, "chi_sq", 10)

            # debug bigrams
            fw = codecs.open("debug_bigrams.txt", "w", "utf-8")
            for w1, w2 in best_bigrams:
                fw.write(w1 + " " + w2 + "\n")
            fw.close()

            print "merging bigrams as single tokens..."
            for d in xrange(len(corpus)):
                for w1, w2 in best_bigrams:
                    corpus[d] = corpus[d].replace(w1 + " " + w2, w1 + w2)

        return corpus

    def convert_to_bag_of_words(self, corpus):
        print "converting corpus to bag-of-words format..."
        print "\ttokenize documents\n\tremove stopwords"
        print "\tapply lemmatizer" if self.use_lemmatizer else "\tapply stemmer"
        print "\tremove rare words\n\tremove very frequent words"
        vectorizer = CountVectorizer(input='content',
                                     decode_error=self.decode_error,
                                     strip_accents=self.strip_accents,
                                     tokenizer=self.tokenizer,
                                     stop_words=self.stop_words +
                                     self.ignore_list,
                                     lowercase=self.lowercase,
                                     max_df=self.max_df,
                                     min_df=self.min_df,
                                     max_features=self.max_features)
        dtm = vectorizer.fit_transform(corpus)  # a sparse matrix
        vocab = vectorizer.get_feature_names()  # a list
        print "vocabulary size:", len(vocab)

        # debug vocabulary
        fw = codecs.open("vocabulary.txt", "w", "utf-8")
        for word in vocab:
            fw.write(word + "\n")
        fw.close()

        return dtm, vocab
Code example #12
File: ner.py  Project: ayat-rashad/stocks_news
client = pymongo.MongoClient('mongodb://localhost:27017')
db = client['yahoofinance_news']
news = list(db['news'].find({}))

path = 'stanford-ner-2015-04-20/stanford-ner.jar'
os.environ['STANFORD_MODELS'] = 'stanford-ner-2015-04-20/classifiers'
st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz', path, java_options='-mx2g') 

def find_orgs(token_tags):
        nes = groupby(token_tags, key=lambda d: d[1])
        nes2 = []

        for k, v in nes:
            if k == 'ORGANIZATION':
                nes2.append(' '.join([t[0] for t in v]))
        return nes2
    

doc_tokens = [wordpunct_tokenize(n['content']) for n in news]
nes = map(find_orgs, st.tag_sents(doc_tokens))
nes = map(np.unique, nes)

requests = []
for n, ne in zip(news, nes):
    requests.append(UpdateOne({'_id':n['_id']}, {"$set":{'nes':ne}}))

db['news'].bulk_write(requests)
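find_orgs can be sanity-checked on a hand-made tag sequence (it relies on itertools.groupby being imported in the original module): consecutive ORGANIZATION tokens collapse into one entity string.

sample = [('Shares', 'O'), ('of', 'O'), ('Morgan', 'ORGANIZATION'),
          ('Stanley', 'ORGANIZATION'), ('rose', 'O')]
print(find_orgs(sample))   # ['Morgan Stanley']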
    
                            
Code example #13
class NerAnalysis:
    def __init__(self):
        self.dict_of_dicts = {}
        self.tokenizer = nltk.tokenize.TweetTokenizer()

        config = configparser.ConfigParser()
        config.read("./config.ini")
        folder = config['NER']['stanford_ner_folder']
        self.stanford_tagger = StanfordNERTagger(
            folder + r'\classifiers\english.muc.7class.distsim.crf.ser.gz',
            folder + r'\stanford-ner.jar',
            encoding='utf-8')

    def tag_text(self, text):
        sentences = nltk.sent_tokenize(text)
        tokenized_sentences = [
            self.tokenizer.tokenize(sent) for sent in sentences
        ]
        classified_sentences = self.stanford_tagger.tag_sents(
            tokenized_sentences)
        list_to_return = []
        for i in range(len(classified_sentences)):
            classified_sent = classified_sentences[i]
            sentence = sentences[i]
            result = self.process_sent(classified_sent, sentence)
            list_to_return.append(result)
        return list_to_return

    def tokenize_text(self, text):
        sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        sentences = sentence_tokenizer.tokenize(text)
        return sentences

    def process_sent(self, classified_sentence, sentence):
        sentence_with_info = []
        sentence_part = sentence
        # tokenization
        tokens = self.tokenizer.tokenize(sentence)
        num_of_tokens = len(classified_sentence)
        for index, token_with_tag in enumerate(classified_sentence):
            word = token_with_tag[0]
            ner_tag = token_with_tag[1]

            # if len(word) == 1 and not word.isalpha():
            # 	pos_tag = "SYM"

            if ner_tag == 'O' or (ner_tag != 'O' and not word.isalpha()):
                if (index == num_of_tokens - 1):
                    tup = sentence_part, None
                    sentence_with_info.append(tup)
                continue

            token_position_final = None
            sentence_splitting = sentence_part
            current_len = 0
            while True:
                token_position = sentence_splitting.find(word)
                if token_position == -1:
                    # the token does not occur verbatim in the remaining text; give up on it
                    break
                # checking after found position
                good_after = False
                if (token_position + len(word) < len(sentence_splitting)):
                    char_after_token = sentence_splitting[token_position +
                                                          len(word)]
                    if (char_after_token.isalpha() == False):
                        good_after = True
                else:
                    good_after = True

                # checking before found position
                good_before = False
                if token_position > 0:
                    char_before_token = sentence_splitting[token_position - 1]
                    if (char_before_token.isalpha() == False):
                        good_before = True
                else:
                    # nothing before the token, so it is a full word
                    good_before = True

                if good_before and good_after:
                    token_position_final = token_position + current_len
                    break

                current_len = token_position + 1
                sentence_splitting = sentence_splitting[(token_position + 1):]

            if token_position_final is None:
                # the token could not be located in the sentence text; skip it
                continue

            # at this point we have the position
            # "token_position_final" where the token was found;
            # split the sentence around it by position
            part_before = sentence_part[0:token_position_final]
            part_after = sentence_part[(token_position_final + len(word)):]

            # pre token part
            tup = part_before, None
            sentence_with_info.append(tup)

            # token and its tag
            tup = word, ner_tag
            sentence_with_info.append(tup)

            sentence_part = part_after

        return sentence_with_info
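
# A brief usage sketch (assumes ./config.ini exists and its [NER] stanford_ner_folder
# entry points at a local Stanford NER installation). tag_text returns one list per
# sentence; untagged stretches of text carry the tag None, recognized tokens carry
# their NER label.
analyzer = NerAnalysis()
for sentence_info in analyzer.tag_text("Angela Merkel met reporters in Berlin on Monday."):
    for segment, tag in sentence_info:
        print(repr(segment), tag)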
Code example #14
0
File: ner.py  Project: parisa-abolfathbeygi/CDPD
	# print 'l:____________', type(l)
	# text = str(' '.join(l))
	# print text
	# sent_list = tokenizer.tokenize(text)

	# print 'sentence list___________'
	# print sent_list

	# a = [sent.split() for sent in sent_list]

	# print 'aaaaaaaaaaaa'
	# print a
	# tagged_sents = st.tag_sents(a)
	# print tagged_sents

	begin = time.time()

	predicted_tagged_corpus = []
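	# `tagged_bodies`, `word_list`, `split_sentence_list`, `tokenizer` and `st` are
	# defined earlier in the original file (not shown in this excerpt): each tagged
	# body is rebuilt into sentences, re-split into tokens and tagged in bulk.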
	for tagged_body in tagged_bodies:
		body = word_list(tagged_body)
		sentences_list = tokenizer.tokenize(str(' '.join(body)))
		sent_list_splitted = split_sentence_list(sentences_list)
		predicted_tagged_body = st.tag_sents(sent_list_splitted)
		predicted_tagged_corpus.append(predicted_tagged_body)


	print predicted_tagged_corpus[0]
	print predicted_tagged_corpus[-1]

	print time.time() - begin, ' seconds!' 
Code example #15
0
class EnPreprocesser(preprocesser.Preprocesser):
    def __init__(self,
                 strip_accents="unicode",
                 lowercase=True,
                 remove_html=True,
                 join_urls=True,
                 use_bigrams=True,
                 use_ner=True,
                 stanford_ner_path="",
                 use_lemmatizer=False,
                 use_stemmer=False):

        self.stanford_ner_path = stanford_ner_path  # path to stanford NER
        self.strip_accents = strip_accents  # options: {‘ascii’, ‘unicode’, None}
        self.lowercase = lowercase
        self.remove_html = remove_html
        self.join_urls = join_urls
        self.use_bigrams = use_bigrams
        self.use_ner = use_ner
        self.use_lemmatizer = use_lemmatizer  # use lemmatizer instead of stemmer?
        self.use_stemmer = use_stemmer

        # self.stanford_corenlp = StanfordCoreNLP(self.stanford_corenlp_path, memory="8g")
        self.sentence_splitter = PunktSentenceTokenizer(
        ).tokenize  # Punkt sentence splitter
        self.stemmer = SnowballStemmer("english").stem  # Snowball stemmer
        self.lemmatizer = WordNetLemmatizer().lemmatize  # WordNet lemmatizer
        self.base_tokenizer = CountVectorizer().build_tokenizer(
        )  # sklearn tokenizer works the best, I think...
        self.stop_words = stopwords.words(
            "english")  # nltk list of 128 stopwords
        self.token_pattern = re.compile(
            r"(?u)\b(\w*[a-zA-Z_]\w+|\w+[a-zA-Z_]\w*)\b"
        )  # default value was r"(?u)\b\w\w+\b"
        self.numeric_pattern = re.compile(r"^[0-9]+$")  # number regex
        self.url_pattern = re.compile(r"((http://)?(www\..*?\.\w+).*?)\s")
        self.compound_pattern = re.compile(r"\w+(\-\w+)+")

        if self.use_lemmatizer:
            self.tokenizer = CustomTokenizer(self.base_tokenizer,
                                             self.lemmatizer,
                                             self.token_pattern,
                                             self.numeric_pattern)
        elif self.use_stemmer:
            self.tokenizer = CustomTokenizer(self.base_tokenizer, self.stemmer,
                                             self.token_pattern,
                                             self.numeric_pattern)
        else:
            self.tokenizer = CustomTokenizer(self.base_tokenizer, lambda x: x,
                                             self.token_pattern,
                                             self.numeric_pattern)

    def find_nbest_bigrams(self, corpus, n, metric, min_freq):
        """
        Find the top-N most frequently occurring bigrams within the corpus.
        """
        print("\nfinding top-%d bigrams using %s..." % (n, metric))
        alltokens = []
        simplerTokenizer = CustomTokenizer(self.base_tokenizer, lambda x: x,
                                           re.compile(".*"), re.compile("^$"))
        for doc in corpus:
            for token in [t for t in simplerTokenizer(doc)]:
                alltokens.append(token)
        bigram_measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(alltokens)
        finder.apply_freq_filter(min_freq)  # bigrams must appear at least min_freq times
        if metric.lower() == "pmi":
            best_bigrams = finder.nbest(bigram_measures.pmi, n)
        elif metric.lower() == "chi_sq":
            best_bigrams = finder.nbest(bigram_measures.chi_sq, n)
        else:
            raise Exception("Unknown metric for bigram finder")
        return best_bigrams

    def remove_punctuation(self, text):
        """
        Remove punctuation.
        """
        return "".join(re.findall(r"[a-zA-Z0-9\s]+", text))
        # return "".join(re.findall(r"[a-zA-Z0-9,.;!:'?\s]+", tokens))
        # return tokens

    def tag_corpus_ner(self, corpus):
        """
        Tag named entitties in corpus with stanfordNER toolkit
        """

        if not hasattr(self, "stanford_ner"):
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=DeprecationWarning)
                # import imp
                self.stanford_ner = StanfordNERTagger(
                    self.stanford_ner_path +
                    "classifiers/english.conll.4class.distsim.crf.ser.gz",
                    self.stanford_ner_path + "stanford-ner.jar")
                self.stanford_ner._stanford_jar = self.stanford_ner_path + "stanford-ner.jar:" + self.stanford_ner_path + "lib/*"

        print("splitting sentences in corpus (for NER)...")
        corpus_sentences = []
        sentence_to_doc_map = {}
        sent_no = 0
        for d in tqdm(range(len(corpus))):
            # print("\r%s " % d, end="")
            for sent in self.sentence_splitter(corpus[d]):
                corpus_sentences.append(sent)
                sentence_to_doc_map[sent_no] = d
                sent_no += 1
        tokenized_sentences = []
        for sent in corpus_sentences:
            tokenized_sentences.append(
                [t for t in re.split(r"\s+", sent) if len(t) > 0])
            #tokenized_sentences = [re.split(r'\s+', sent) for sent in corpus_sentences]

        print("tagging sentences with Stanford NER...")
        tagged_sentences = []
        for batch in tqdm(range(self.ner_batch)):
            # print("\r%s/%s tagging sentences with Stanford NER..." % (batch, self.ner_batch), end="")
            chunk = int(len(corpus) / self.ner_batch)
            tagged_sentences += self.stanford_ner.tag_sents(
                tokenized_sentences[batch * chunk:(batch + 1) * chunk])
        # process NER output: group tagged sentences back into their documents
        # (indexing by document number keeps alignment with `corpus` even if a
        # document produced no sentences)
        tagged_corpus = [[] for _ in range(len(corpus))]
        for i in range(len(tagged_sentences)):
            tagged_corpus[sentence_to_doc_map[i]] += tagged_sentences[i]

        # get dictionary of named entities per document
        named_entities = []
        for tagged_doc in tagged_corpus:
            tags = {}
            current_ne = []
            for token, tag in tagged_doc:
                if current_ne:
                    if tag == "O" or (tag != "O" and tag != current_ne[-1][1]):
                        tags[" ".join([t for t, _ in current_ne
                                       ])] = current_ne[0][1]
                        current_ne = []
                if tag != "O":
                    current_ne.append((token, tag))
            if current_ne:
                tags[" ".join([t for t, _ in current_ne])] = current_ne[0][1]
            named_entities.append(tags)
        return tagged_corpus, named_entities

    def preprocess_corpus(self, corpus):
        """
        Preprocess the corpus.
        """
        self.ner_batch = int(math.ceil(len(corpus) / 5000))
        print("preprocessing corpus...")
        print("corpus size: %i, ner_batch=%i" % (len(corpus), self.ner_batch))

        # first pass over the corpus: prepare for NER
        print("first pass over the corpus...\n\tunescape characters")
        if self.remove_html: print("\tremove html")
        if self.strip_accents: print("\tstrip accents")
        if self.join_urls: print("\tjoin URLs")
        print("\tjoin compound words\n\tspace out punctuation")

        for d in tqdm(range(len(corpus))):
            corpus[d] = html.unescape(corpus[d]) + " "

            if self.remove_html:
                corpus[d] = self.remove_html_tags(corpus[d])
            if self.strip_accents == "unicode":
                corpus[d] = self.strip_accents_unicode(corpus[d])
            if self.join_urls:
                corpus[d] = self.join_urls_to_token(corpus[d],
                                                    self.url_pattern)
            corpus[d] = self.join_compound_words(corpus[d],
                                                 self.compound_pattern)
            corpus[d] = self.space_out_punctuation(corpus[d])
            # print("\r\t%s" % d,  end="")

        if self.use_ner:
            tagged_corpus, named_entities = self.tag_corpus_ner(corpus)

            # debug NER
            fw = codecs.open("debug_NER.txt", "w", "utf-8")
            for tags in named_entities:
                fw.write("%s\n" % list(tags.items()))
            fw.close()

            print("\nmerging named entities as single tokens...")
            for d in tqdm(range(len(tagged_corpus))):
                tags = named_entities[d]
                for ne in tags:
                    corpus[d] = corpus[d].replace(ne, re.sub(r"\s+", "", ne))
                # print("\r%s " % d,  end="")

        # second pass over the corpus: remove punctuation and convert to lowercase
        # (these were useful above for NER, but now can be removed)
        print("\nsecond pass over the corpus...")
        if self.lowercase: print("\tconvert to lowercase")
        print("\tremove punctuation")
        for d in tqdm(range(len(corpus))):
            corpus[d] = self.remove_punctuation(corpus[d])
            if self.lowercase:
                corpus[d] = corpus[d].lower()
            # print("\r\t%s" % d,  end="")

        if self.use_bigrams:
            # find top N bigrams
            # best_bigrams = self.find_nbest_bigrams(corpus, 100, "pmi", 10)
            best_bigrams = self.find_nbest_bigrams(corpus, 100, "chi_sq", 10)

            # debug bigrams
            fw = codecs.open("debug_bigrams.txt", "w", "utf-8")
            for w1, w2 in best_bigrams:
                fw.write(w1 + " " + w2 + "\n")
            fw.close()

            print("\n")
            for d in range(len(corpus)):
                print("\r%s merging bigrams as single tokens..." % d, end="")
                for w1, w2 in best_bigrams:
                    corpus[d] = corpus[d].replace(w1 + " " + w2, w1 + w2)

        return [sent for sent in corpus]

    # helper functions
    def strip_accents_unicode(self, text):
        return "".join([
            c for c in unicodedata.normalize("NFKD", text)
            if not unicodedata.combining(c)
        ])

    def remove_html_tags(self, text):
        return re.sub(r"( ?\.+ )+", " . ", re.sub(r"<[^>]*>", " . ", text))

    def join_urls_to_token(self, text, url_pattern):
        m = re.search(url_pattern, text)
        while m:
            text = re.sub(url_pattern,
                          m.group(3).replace("http://", "").replace(".", ""),
                          text)
            m = re.search(url_pattern, text)
        return text

    def join_compound_words(self, text, compound_pattern):
        m = re.search(compound_pattern, text)
        while m:
            text = re.sub(m.group(0), m.group(0).replace("-", ""), text)
            m = re.search(compound_pattern, text)
        return text

    def space_out_punctuation(self, text):
        text = re.sub(r",\s", " , ", text)
        text = re.sub(r"\.\.\.\s", " ... ", text)
        text = re.sub(r"\.", " . ", text)
        text = re.sub(r";\s", " ; ", text)
        text = re.sub(r":\s", " : ", text)
        text = re.sub(r"\?\s", " ? ", text)
        text = re.sub(r"!\s", " ! ", text)
        text = re.sub(r"\"", " \" ", text)
        text = re.sub(r"\'", " \' ", text)
        text = re.sub(r"\s\(", " ( ", text)
        text = re.sub(r"\)\s", " ) ", text)
        text = re.sub(r"\s\[", " [ ", text)
        text = re.sub(r"\]\s", " ] ", text)
        text = re.sub(r"-", " - ", text)
        text = re.sub(r"_", " _ ", text)
        text = re.sub(r"\n", " ", text)
        text = re.sub(r"\r", " ", text)
        text = re.sub(r"\s+", " ", text)
        tokens = self.tokenizer(text)
        tokens = " ".join(tokens)
        return tokens
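
# A minimal usage sketch, not part of the original listing. The Stanford NER path is a
# placeholder and must end with a trailing slash, since the class appends
# "classifiers/..." and "stanford-ner.jar" to it.
prep = EnPreprocesser(stanford_ner_path="/path/to/stanford-ner/",  # placeholder path
                      use_ner=True, use_bigrams=True, use_lemmatizer=True)
clean_docs = prep.preprocess_corpus(raw_docs)  # raw_docs: a list of document strings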
Code example #16
0
File: mapper.py  Project: sksq/prodigy
# list of tokenized tweets, filled in below
listOfListOfWords = []

# for each line from stdin
for line in stdin:
	try:
		# load json-tweet
		tweet = loads(line)
		tweetText = tweet['text']
		
		# tokenize tweet-text
		listOfWords = word_tokenize(tweetText)
		listOfListOfWords.append(listOfWords)
	
	except:
		# skip lines that are not valid JSON tweets
		pass

# StanfordNER instance
nerClf = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
nerPair = nerClf.tag_sents(listOfListOfWords)

# word is location and greater than 2 character
locations = []
for ner in nerPair:
	for word, nerType in ner:
		if nerType == 'LOCATION' and len(word) > 2:
			locations.append(word.lower())

# `frequency` is assumed to be defined earlier in the original script (not shown
# in this excerpt), e.g. a fixed emit count for a downstream reducer.
for location in locations:
	print((location, frequency))


Code example #17
0
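# Fragment from a larger script: `text`, `originalSize`, `is502`, `wordList`, `st`,
# `sentences_with_date`, `relevant_sents`, `names`, `i` and `previousPerson` are all
# defined earlier in the original file (not shown in this excerpt).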
if originalSize < 5000000 and originalSize > 1000:
    sentences = sent_tokenize(text)
    for sent in sentences:
        # keep sentences that contain a date such as "March 5, 2016"
        if re.search(r'[A-Z]\w+ ([1-9]|[12][0-9]|3[01]), \d{4}', sent) and is502:
            sentences_with_date.append(sent)
            words = pos_tag(word_tokenize(sent))
            for word in words:
                if word[1] == "VB":
                    sets = wn.synsets(word[0])
                    for s in sets:
                        for w in wordList:
                            # path_similarity can return None when the synsets share no path
                            similarity = w.path_similarity(s)
                            if similarity is not None and similarity > 0.3:
                                relevant_sents.append(sent)

    sents = st.tag_sents([word_tokenize(sent) for sent in sentences])
    for classedSent in sents:
        for word in classedSent:
            if 'PERSON' in word[1] and not previousPerson:
                i += 1
                names.append(word[0])
                previousPerson = True
            elif 'PERSON' in word[1]:
                names[i] = names[i] + " " + word[0]
                previousPerson = True
            else:
                previousPerson = False
elif originalSize > 1000:
    chunks = (text[0+i:10000+i] for i in range(0, len(text), 10000))
    for chunk in chunks:
        sentences = sent_tokenize(chunk)
Code example #18
0
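        # Fragment from a larger script: this block runs inside a loop over parsed
        # sentences; `places`, `entities`, `verbs`, `persons`, `tokenized_sents` and
        # `st` are defined earlier in the original file (not shown in this excerpt).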
        if ("in a " in str(sentence)):
            string = str(sentence)
            index = string.index("in a ") + len("in a ")
            string = string[index:]
            word = nltk.word_tokenize(string)[0]
            places.append(word)

        for word, pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):
            if (pos == 'NNS'):
                if (word not in entities):
                    entities.append(word)
            if ('VB' in pos):
                if (word not in verbs):
                    verbs.append(word)

classified_text = st.tag_sents(tokenized_sents)

for item in classified_text:
    for x, y in item:
        if (y == 'PERSON'):
            if (x not in persons):
                persons.append(str(x))

print(entities)
print(len(entities))
print(persons)
print(len(persons))

# persist the extracted person names
outfile = open("persons", "wb")
pickle.dump(persons, outfile)
outfile.close()
Code example #19
0
class EmailGraph:
    #http://py2neo.org/2.0/intro.html#nodes-relationships
    #Creates a New Graph (You will Need to Update this Function for your own install)
    def __init__(self, user, pwrd):
        authenticate("localhost:7474", user, pwrd)
        self.graph = Graph("http://localhost:7474/db/data/")
        java_path = r"C:\ProgramData\Oracle\Java\javapath\java.exe"
        os.environ['JAVAHOME'] = java_path
        self.st = StanfordNERTagger(r'C:\stanford-ner-2015-12-09\classifiers\english.conll.4class.distsim.crf.ser.gz',
                                    r'C:\stanford-ner-2015-12-09\stanford-ner.jar')
        self.stop_words = nltk.corpus.stopwords.words('english')
        self.legal_words = {"section","fw","re","ops","fyi","doc no","case no","subtitle","btw","usc","foia","chapter","u.s.c",\
               "report","attachment","attachments","note","amended", "ebook","subject","unclassified department of state case","doc",\
               "unclassified u.s. department of state","original message","project", "copyright", "pls", "you","u.s. department of state case no"}

    #process email: removes some of the headings before looking for keywords
    def process_email(self, email):
        processed = ""
        for line in email.split('\n'):
            s = line.lower()
            if s.startswith("unclassified u.s. department of state") or \
               s.startswith("release in") or \
               s.startswith("original message") or \
               s.startswith("to:") or \
               s.startswith("from:") or \
               s.startswith("sent:") or  \
               s.startswith("cc:"):
                pass
            else:
                if len(line) > 0 and line[-1] == '.':
                    processed = processed + line + ' '
                else:
                    processed = processed + line + '. '
        return processed

    #filter_by_contents: receives a list of noun_phrases and filters out phrases contained in longer phrases elsewhere in the list
    def filter_by_contents(self, noun_phrases):
        in_others = []
        for i, candidate in enumerate(noun_phrases):
            for j, other in enumerate(noun_phrases):
                if i != j:
                    if candidate[0].lower() in other[0].lower() and candidate[
                            0] != other[0]:  #compare each phrase with another
                        in_others.append(candidate)
        #filter out our identified 'duplicate' words and stopwords.
        filtered_words = [w for w in noun_phrases if w not in in_others and \
                          w[0].lower() not in self.legal_words and w[0].lower() not in self.stop_words]

        #create a Frequency Distribution
        unigram_fd = nltk.FreqDist(filtered_words)
        #get the most common phrases
        common_noun_phrases = unigram_fd.most_common(20)

        result = []
        words = set([w[0][0].lower() for w in common_noun_phrases])
        for w in words:
            best_match = None
            for phrase in common_noun_phrases:
                if phrase[0][0].lower() == w:
                    if best_match is None:
                        best_match = phrase
                    else:
                        best_match = (best_match[0], best_match[1] + phrase[1])
            result.append(best_match)
        return sorted([w for w in result], key=lambda w: w[1], reverse=True)

    #filter_by_hypernym: receives a list of candidates and finds the best hypernym for each.
    #I started with code by Anna Swigart, ANLP 2015, and her concept of using a dictionary to store
    #terms from WordNet, however this code drastically departs from her algorithm.
    def filter_by_hypernym(self, candidates):
        #create a dictionary
        results = []
        for term in candidates:  #loop through list of candidates
            synsets = wn.synsets(term[0][0],
                                 'n')  #obtain the synsets for the phrase
            if len(synsets) >= 1:
                hypers = synsets[0].hypernyms(
                ) + synsets[0].instance_hypernyms()
                if len(hypers) >= 1:
                    results.append(((term[0][0],
                                     hypers[0].name().split('.')[0]), term[1]))
                else:
                    results.append(term)
            else:
                results.append(term)
        return results

    #algorithm for extracting key words from an email body
    def final_algorithm(self, email):
        #Create Sentences
        sentences = nltk.sent_tokenize((self.process_email(email)))
        tokenized_sentences = []
        for s in sentences:
            #get the tokens for each sentence that are filtered
            tokenized_sentences.append([word for word in nltk.word_tokenize(s) \
                                        if not re.search('[0-9]', word) and word.lower() not in self.legal_words and len(word) > 2])

        #separate the NER tagged entities from the rest
        def get_entities(tags):
            result = []
            curr = []
            for ent in tags:
                if ent[1] == 'O':
                    if len(curr) > 0:
                        result.append(curr)
                        curr = []
                else:
                    if len(curr) > 0:
                        if not curr[0][1] == ent[1].lower():
                            result.append(curr)
                            curr = [(ent[0], ent[1].lower())]
                        else:
                            curr = curr + [(ent[0], ent[1].lower())]
                    else:
                        curr = [(ent[0], ent[1].lower())]
            return result

        #NER tag each of the sentences
        tagged_sents = self.st.tag_sents(tokenized_sentences)
        entity_names = []
        for s in tagged_sents:
            entity_names = entity_names + get_entities(s)

        #reorganize the entities for further processing
        def compress_entities(entities):
            new_list = []
            for entity in entities:
                result = " ".join([w[0] for w in entity])
                new_list.append((result, entity[0][1]))
            return new_list

        entity_names = compress_entities(entity_names)
        #print(entity_names)

        # use the merged entity names as the candidate noun phrases
        noun_phrases = entity_names

        #Candidates Filtered by Duplicate Nouns and Rescored by Length
        noun_phrases = self.filter_by_contents(noun_phrases)
        #print(noun_phrases)

        #Candidate with better categories/hypernyms!
        noun_phrases = self.filter_by_hypernym(noun_phrases)

        #print("Email:\n" + email)
        print("Key Phrases:\n" + str(noun_phrases))

        return noun_phrases

    #clears out a graph
    def delete(self):
        self.graph.delete_all()

    #checks to see if a node exists in a graph
    #http://stackoverflow.com/questions/22134649/how-to-check-if-a-node-exists-in-neo4j-with-py2neo
    def find_existing(self, label, key, value):
        mynode = list(
            self.graph.find(label, property_key=key, property_value=value))
        # node found
        if len(mynode) > 0:
            return mynode[0]
        # no node found
        else:
            return None

    #adds a new 'email' data element to the graph
    #code based on http://py2neo.org/2.0/intro.html#nodes-relationships
    def add_to_graph(self, data_element, terms):

        #['Id', 'DocNumber', 'MetadataSubject', 'MetadataTo', 'Metadata From',
        #'MetadataDateSent', 'ExtractedSubject', 'ExtractedTo',
        #'ExtractedFrom', 'ExtractedBodyText','RawText', 'Label']]
        email_id = data_element['DocNumber']
        email_feeling = data_element['NewLabel']
        email = self.find_existing("Email", "docid", email_id)
        if email is None:
            if str(email_feeling) == '1':
                email_feelstr = 'emotional'
                n = 'E'
            else:
                email_feelstr = 'neutral'
                n = 'N'
            email = Node("Email", name = n, docid = email_id, tone=email_feelstr,\
                subject=data_element["ExtractedSubject"], date=data_element['MetadataDateSent'])
        s = email

        #add From nodes
        from_id_all = data_element['ExtractedFrom']
        if type(from_id_all) is str:
            for from_id_i in from_id_all.split(';'):
                from_id = from_id_i.strip().strip('\'')
                sender = self.find_existing("User", "address", from_id)
                if sender is None:
                    sender = Node("User", address=from_id)
                s = s | Relationship(sender, "SENT", email)

        #add To nodes
        to_id_all = data_element['ExtractedTo']
        if type(to_id_all) is str:
            for to_id_i in to_id_all.split(';'):
                to_id = to_id_i.strip().strip('\'')
                receiver = self.find_existing("User", "address", to_id)
                if receiver is None:
                    receiver = Node("User", address=to_id)
                s = s | Relationship(receiver, "RECEIVED", email)

        #add Emotion nodes
        emote_all = data_element['Emotions']
        #print(emote_all)
        if type(emote_all) is str:
            print("Emotions: " + str(emote_all))
            for emote in emote_all.split(';'):
                if len(emote) > 0:
                    emotion = self.find_existing("Emotion", "name", emote)
                    if emotion is None:
                        emotion = Node("Emotion", name=emote)
                    s = s | Relationship(email, "EMOTED", emotion)

        self.graph.create(s)

        #add keywords and categories
        for item in range(0, len(terms)):
            keyword = terms[item][0][0]
            category = terms[item][0][1]
            n = self.find_existing("Keyword", "name", keyword)
            if n is None:
                n = Node("Keyword", name=keyword)
            s = Relationship(email, "MENTIONS", n)

            c = self.find_existing("Category", "name", category)
            if c is None:
                c = Node("Category", name=category)
            s = s | Relationship(n, "IS_TYPE_OF", c)
            self.graph.create(s)

    #get_random_emails - returns a number of random emails from a given data frame
    def get_random_emails(self, data_set, number):
        random_index = np.random.permutation(data_set.index)
        full_data_shuffled = data_set.loc[random_index,
            ['Id', 'DocNumber', 'MetadataSubject', 'MetadataTo', 'Metadata From', 'MetadataDateSent',
             'ExtractedSubject', 'ExtractedTo', 'ExtractedFrom', 'ExtractedBodyText', 'RawText',
             'NewLabel', 'Emotions']]
        full_data_shuffled.reset_index(drop=True, inplace=True)
        #separate the training data from the development data
        return full_data_shuffled.loc[0:number - 1]

    #adds a specified number of emails from a dataset
    def add_new_emails(self, num, total_df):
        selected_emails = self.get_random_emails(total_df, num)
        selected_emails["MetadataDateSent"].fillna(value='<blank>',
                                                   inplace=True)
        selected_emails["ExtractedSubject"].fillna(value='<blank>',
                                                   inplace=True)
        data_list = selected_emails["RawText"].values.tolist()
        subject_list = selected_emails["ExtractedSubject"].values.tolist()
        printable = set(string.printable)

        #for each email, extract the key words and then add to the graph
        for index in range(0, num):
            s = "".join(filter(lambda x: x in printable, data_list[index])) + ' . ' +\
                "".join(filter(lambda x: x != '<blank>' and x in printable, subject_list[index]))
            terms = self.final_algorithm(s)
            self.add_to_graph(selected_emails.loc[index], terms)
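
# A minimal end-to-end sketch, not part of the original listing (assumes a Neo4j
# server on localhost:7474, hypothetical credentials, and a pandas DataFrame
# `emails_df` containing the columns used by get_random_emails and add_to_graph).
graph = EmailGraph("neo4j", "password")  # hypothetical credentials
graph.delete()                           # optional: start from an empty graph
graph.add_new_emails(25, emails_df)      # extract key phrases and build the graph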