def calc_words_metadata(s):
    tokenizer = RegexpTokenizer(u'[а-яё]+')
    mystem_gr_tokenizer = RegexpTokenizer(mystem_gr_tokens)
    mystem_gr_vectorizer = CountVectorizer(
        tokenizer=mystem_gr_tokenizer.tokenize,
        vocabulary=mystem_gr_vocab,
        binary=True)
    token_coords = tokenizer.span_tokenize(s)
    raw_gr_descs = []
    word_indices = []
    for start, end in token_coords:
        tn = s[start:end]
        # cache the Mystem grammatical description per surface form
        if tn not in mystemmer_cache:
            d = mystemmer.analyze(tn)
            if (len(d) < 1 or 'analysis' not in d[0]
                    or len(d[0]['analysis']) < 1
                    or 'gr' not in d[0]['analysis'][0]):
                ext_gr_string = ''
            else:
                ext_gr_string = d[0]['analysis'][0]['gr']
            mystemmer_cache[tn] = ext_gr_string
        gr_string = mystemmer_cache[tn]
        if len(gr_string) == 0:
            continue
        # skip a token that runs to the very end of the string,
        # keeping word_indices and raw_gr_descs aligned
        if end < len(s):
            word_indices.append(end)
            raw_gr_descs.append(gr_string)
    # one binary grammeme vector per recorded word
    rows = mystem_gr_vectorizer.fit_transform(raw_gr_descs)
    rows = rows.toarray()
    result = {index: row for index, row in zip(word_indices, rows)}
    return result
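The function above relies on several module-level objects (mystemmer, mystemmer_cache, mystem_gr_tokens, mystem_gr_vocab) that are not shown in the snippet. A minimal sketch of how they might be set up, assuming pymystem3 and scikit-learn; the grammeme pattern and vocabulary below are illustrative placeholders, not the project's real values:

from nltk.tokenize import RegexpTokenizer
from pymystem3 import Mystem
from sklearn.feature_extraction.text import CountVectorizer

# Mystem().analyze(word) returns entries like
# {'analysis': [{'gr': 'S,жен,од=им,ед', ...}], 'text': word}
mystemmer = Mystem()
mystemmer_cache = {}  # surface form -> grammatical description ('gr') string
mystem_gr_tokens = r'[^,=|()]+'  # hypothetical: split a 'gr' string into individual grammemes
mystem_gr_vocab = ['S', 'V', 'A', 'ед', 'мн', 'им']  # hypothetical subset of the grammeme vocabulary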
class ProcessorTokenizerNltkEn:
    """Performs tokenization of English texts.

    Wrapper around NLTK RegexpTokenizer.
    """

    def __init__(self, delay_init=False, *args, **kwargs):
        self._proc = None
        if not delay_init:
            self.init(*args, **kwargs)

    def init(self, abbrevs=_en_abbrevs):
        if self._proc is None:
            _en_regex = '|'.join(abbrevs + _ru_rules)
            self._proc = RegexpTokenizer(_en_regex)

    def __call__(self, text):
        """Performs tokenization of text.

        Args:
            text(str): raw text.

        Returns:
            List of Token objects.
        """
        return [
            Token(text[start:end], start, end)
            for (start, end) in self._proc.span_tokenize(text)
        ]
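This wrapper depends on the module's Token class and the _en_abbrevs / _ru_rules regex lists, which are not part of the excerpt. A self-contained sketch of the underlying technique, using a namedtuple in place of Token and a simplified pattern instead of _en_regex:

from collections import namedtuple
from nltk.tokenize import RegexpTokenizer

Token = namedtuple('Token', ['text', 'begin', 'end'])  # assumed shape of the Token class

text = "Mr. Brown arrived at 5 p.m. sharp."
tokenizer = RegexpTokenizer(r"[A-Za-z]+\.?|\d+|[^\w\s]")  # simplified stand-in for _en_regex
tokens = [Token(text[start:end], start, end)
          for start, end in tokenizer.span_tokenize(text)]
# span_tokenize yields (start, end) character offsets, so each Token keeps its position in the raw text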
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer


def get_tokens(text, spellcheck=False):
    # correct() (used when spellcheck=True) is assumed to be defined elsewhere
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = PorterStemmer()

    # Basic tokenization: keep the start offset of every lowercased word
    tokens = [(s, text[s:e].lower()) for s, e in tokenizer.span_tokenize(text)]
    if spellcheck:
        tokens = [(i, correct(word)) for i, word in tokens]
    tokens = [(i, stemmer.stem(word)) for i, word in tokens]

    # Stop words and bigrams
    en_stopwords = stopwords.words('english')
    prev_token = None
    new_tokens = []
    for i, word in tokens:
        if word in en_stopwords:
            prev_token = None
        else:
            new_tokens.append((i, word))
            if prev_token is not None:
                prev_i, prev_word = prev_token
                new_tokens.append((prev_i, prev_word + ' ' + word))
            prev_token = i, word
    tokens = new_tokens
    return tokens
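A hypothetical call, assuming the NLTK stopwords corpus has been downloaded; with spellcheck=True a correct() helper (not shown in the snippet) would also be required:

text = "Neural networks learn useful representations of text"
for offset, term in get_tokens(text):
    # unigrams are stemmed, lowercased words; bigrams join two adjacent
    # non-stopword stems and carry the character offset of the first one
    print(offset, term)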
def string_to_span(self, s):
    # creates a tokenized version and a span version of a string
    # helper to parse_xmls
    punctuation = "-,.?!:;"
    tokenizer = RegexpTokenizer(r"\s|:|;", gaps=True)
    tokenized = tokenizer.tokenize(s.lower())
    # remove punctuation if it is the last character of a word
    tokenized = [word.strip(punctuation) if word[-1] in punctuation else word
                 for word in tokenized]
    # span_tokenize gives the Pythonic span, i.e. (start, stop_but_not_including)
    span = list(tokenizer.span_tokenize(s))
    new_span = []
    for start, stop in span:
        # convert to the non-Pythonic span, i.e. (start, last_char)
        new_span.append((start, stop - 1))
    return new_span, tokenized
def highlight_phrases_from_list(t):
    # tokenize the text of the description, with spans
    tokenizer = RegexpTokenizer(r'[a-zA-Z0-9#+-]+')
    span_generator = tokenizer.span_tokenize(t)
    spans = [span for span in span_generator]
    tokens = [t[span[0]:span[1]] for span in spans]

    # create a dictionary of the words, with spans as the values,
    # and another dictionary with the same keys, with the word indexes as the values
    char_span = defaultdict(list)
    word_index = defaultdict(list)
    for i, (k, span) in enumerate(zip(tokens, spans)):
        char_span[k].append(span)
        word_index[k].append(i)

    # is this useful?
    df = pd.DataFrame({'Character Index Spans': pd.Series(char_span),
                       'Word Indexes': pd.Series(word_index)})

    # look up each skill phrase: match its first word, then check that the
    # following tokens match the rest of the phrase
    highlight_spans = []
    for skill_phrase in skill_phrase_wl:
        if word_index.get(skill_phrase[0]):
            for i, occurence in enumerate(word_index.get(skill_phrase[0])):
                if all(skill_phrase[j] == tokens[j + occurence]
                       for j in range(len(skill_phrase))):
                    highlight_span = (spans[occurence][0],
                                      spans[occurence + len(skill_phrase) - 1][1])
                    highlight_spans.append(highlight_span)

    # # look up the words in our skill list in the dictionary.
    # # List the findings as spans to be highlighted
    # for skill in single_word_skills:
    #     highlight_spans += char_span[skill]

    # Sort the spans to be highlighted
    highlight_spans.sort()

    # Insert HTML tags to highlight the keywords
    html_start_tag = '<font color="red">'
    html_end_tag = '</font>'
    highlighted = ''
    cursor = 0
    for span in highlight_spans:
        if span[0] > cursor:  # go forwards only, not backwards
            if cursor > 0:
                highlighted += html_end_tag
            highlighted += t[cursor:span[0]] + html_start_tag + t[span[0]:span[1]]
        elif span[1] > cursor:
            highlighted += t[cursor:span[1]]
        cursor = span[1]
    # close the last open tag (only if something was highlighted) and append the rest
    if cursor > 0:
        highlighted += html_end_tag
    highlighted += t[cursor:]

    display(HTML(highlighted))
class ProcessorTokenizerRu:
    """Performs tokenization of Russian texts with regexes.

    Wrapper around NLTK RegexpTokenizer. Supports Russian abbreviations.
    """

    def __init__(self, delay_init=False):
        self._proc = None
        if not delay_init:
            self.init()

    def init(self):
        if self._proc is None:
            self._proc = RegexpTokenizer(_ru_regex, flags=re.IGNORECASE)

    def __call__(self, text):
        assert self._proc
        return [
            Token(text[start:end], start, end)
            for (start, end) in self._proc.span_tokenize(text)
        ]
def tokenize_mail(self, mailtext):
    """
    Uses RegexpTokenizer to split the mail text on the pattern defined below.
    Each split is a separate mail. Returns a list of the mails contained in
    the given mailtext.
    """
    mails = []
    # splits the entire mail into parts matching the
    # 'On <Date Time> <*****@*****.**> wrote:' pattern
    tokenizer = RegexpTokenizer(
        r'\n[>|\s]*On[\s]* ([a-zA-Z0-9, :/<>@\.\"\[\]\r\n]*[\s]* wrote:)',
        gaps=True)
    mail_indices = tokenizer.span_tokenize(mailtext)
    # uses the splits' offset information from span_tokenize to slice the actual
    # mail content and stores each slice as an element of the 'mails' list
    start = end = 0
    for index in mail_indices:
        end = index[1] + 1
        mails.append(mailtext[start:end])
        start = end
    return mails  # list of the contained mails within a single mailtext
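A standalone sketch of the splitting idea behind this method: with gaps=True, span_tokenize returns the offsets of the text between the 'On ... wrote:' separators, and those offsets are used to slice the thread. The sample text and the simplified pattern below are illustrative, not the method's real inputs:

from nltk.tokenize import RegexpTokenizer

thread = "Thanks, that works.\nOn Mon, Jan 1 <a@b.com> wrote:\n> Did you try restarting?"
splitter = RegexpTokenizer(r'\nOn[^\n]* wrote:', gaps=True)
parts = [thread[start:end] for start, end in splitter.span_tokenize(thread)]
# parts[0] is the reply, parts[1] is the quoted original message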
class IndexFile(object):
    """Indexes a single document: stores each non-stopword token and its character offset."""

    def __init__(self, id, doc):
        self.tokenizer = RegexpTokenizer(r'\w+')
        self.stopwords = set(stopwords.words('english'))
        self.stemmer = EnglishStemmer()
        self.doc = doc
        self.id = id

    def get_data(self):
        for start, end in self.tokenizer.span_tokenize(self.doc):
            token = self.doc[start:end].lower()
            if token in self.stopwords:
                continue
            words = Words(name=token)
            if word_exists(words.name):
                words = session.query(Words).filter_by(name=words.name).first()
            else:
                session.add(words)
                session.commit()
            index = Index(doc_id=self.id, word_id=words.id, beg=start)
            session.add(index)
            session.commit()
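The Words/Index ORM models, the session object, and word_exists() come from the surrounding project and are not shown. A minimal sketch of just the per-document preprocessing that get_data performs before the database writes (requires the NLTK stopwords corpus):

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

doc = "Indexing keeps the starting offset of every useful token."
tok = RegexpTokenizer(r'\w+')
stop = set(stopwords.words('english'))
postings = [(doc[start:end].lower(), start)
            for start, end in tok.span_tokenize(doc)
            if doc[start:end].lower() not in stop]
# each entry pairs a lowercased non-stopword token with its character offset, as get_data does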
def generate_feedback(self, methods: list, gazetteers: list):
    """
    Generates feedback according to the medical entities found in the essay
    with the specified methods and gazetteers.
    Feedback can be found at FEEDBACK_PATH.

    :param methods: list of methods used for annotation
    :param gazetteers: list of gazetteers that should be taken into account
    """
    # all feedback transmitted to CASUS is saved here
    feedbackList = []
    feedbackCount = 0

    # will be printed in the CSV feedback stats file, with the essayID in the 1st column;
    # then for each feedback there are 3 columns:
    #   0 / 1 (feedback given / not given)
    #   specified terms (main term or synonym associated with found term)
    #   found terms
    # specified and found terms of multiple conditions are separated with "&",
    # various found terms or specified synonyms with ";"
    feedbackStats = [self.essay_name]
    # count of the feedback (= line in feedback table)
    statscount = 0

    # save feedback to a file
    with open(self.file_path + "/" + self.essay_name.split(".")[0] +
              "_feedback.txt", "w") as feedback_f:
        # finds all strings with alphanumerical characters or with '-' or '_'
        # (the latter implicitly through \w)
        tokenizer = RegexpTokenizer(r'[\w-]+')
        token_offsets = list(tokenizer.span_tokenize(self.essay))

        # check for each feedback whether it fulfills the constraints (terms existing or missing)
        for (terms_main, terms_synonyms, feedback, feedbackType) in self.case_feedback[self.caseId]:
            fulfilled, useful_terms_main_advanced, useful_terms_synonyms_advanced = \
                self.check_conditions(terms_main, terms_synonyms, methods, gazetteers)
            #print("terms: " + str(useful_terms_main_advanced))
            #print("syn: " + str(useful_terms_synonyms_advanced))

            # if all conditions are fulfilled (e.g. MRT & NOT Roentgen are two conditions)
            if all(x == True for x in fulfilled):
                #print("fulfilled")
                # each fulfilled feedback will be transmitted to CASUS with the following information:
                # (feedbackID, set of offsets of found terms, set of associated strings, feedback text)
                # feedbackID is simply a count of all given feedbacks
                feedback_entry = (feedbackCount, [], [], feedback, feedbackType)
                # record that feedback was given in feedback stats
                feedbackStats.append("1")
                feedbackStats.append("")
                feedbackStats.append("")
                # some main terms are made of two different conditions, e.g. MRT & NOT Roentgen
                # (these are two options); a term made of various words is still one option,
                # e.g. Hepatitis Serologie
                for i, option in enumerate(terms_main):
                    #print("Option " + str(i) + ": " + ''.join(option))
                    # the last feedback often has "andere" (others) as the main term and all
                    # possible terms as synonyms; in this case, terms_main contains only the
                    # empty set, so the only option is empty
                    # if the option is a negated term (so the term was not found in the essay),
                    # record the negated term and all its synonyms that were not found in the
                    # essay in the feedback file
                    if len(option) != 0 and "NOT" in option[0]:
                        feedback_f.write("\nTRIGGER: None of the following terms detected\n")
                        feedback_f.writelines(
                            sorted([t.strip() + ", " for t in (option + terms_synonyms[i])]))
                        # record reason for feedback in feedback stats as the main term that was not found
                        if feedbackStats[statscount * 3 + 2] == "":
                            feedbackStats[statscount * 3 + 2] = ''.join(option)
                        # if this is not the first option
                        else:
                            feedbackStats[statscount * 3 + 2] += " & " + ''.join(option)
                    # if the option is a positive term (and the term or a synonym was found)
                    else:
                        feedback_f.write("\nTRIGGER: Detection of the following terms\n")
                        feedbackString = ""
                        # if the main term is detected, report only this, else report detected synonyms
                        l = useful_terms_main_advanced[i] if len(useful_terms_main_advanced[i]) != 0 \
                            else useful_terms_synonyms_advanced[i]
                        syn = False if len(useful_terms_main_advanced[i]) != 0 else True
                        # for each option, various terms may have been found (especially regarding synonyms)
                        for found, targetTerm in l:
                            same = False
                            # various words in the text may be associated with a targetTerm, they are listed in found
                            # e.g. ((8, 'körperlicher'), (9, 'untersuchung')) or, if only one word, ((14, 'labors'),)
                            foundlist = list(found)
                            tokenstring = ""
                            indexlist = []
                            # get the complete found string and associated word indices (indices of tokenized text)
                            for index, token in foundlist:
                                if tokenstring == "":
                                    tokenstring = token
                                else:
                                    tokenstring = tokenstring + " " + token
                                indexlist.append(index)
                            # get the start and end character offsets of the found words
                            startchars = token_offsets[indexlist[0]]
                            endchars = token_offsets[indexlist[-1]]
                            start = startchars[0]
                            end = endchars[1]
                            # add the offsets and the found term to the feedback to be transmitted to CASUS;
                            # various synonyms may be associated with the same word in the text, in which case
                            # don't add the term twice to the list of words that triggered the feedback
                            if tokenstring not in feedback_entry[2]:
                                feedback_entry[1].append((start, end))
                                feedback_entry[2].append(tokenstring)
                            # if the found term matches exactly the main term (or synonym),
                            # this is recorded with a different wording in the feedback file
                            if tokenstring.lower() == targetTerm.lower():
                                text_feedback = " wurde im Text gefunden an Position " + str(indexlist) + \
                                    " (" + str(start) + "-" + str(end) + ")"
                            else:
                                text_feedback = " " + str(indexlist) + " (" + str(start) + "-" + str(end) + \
                                    ") im Text wurde assoziiert mit " + targetTerm
                            # various found terms in the essay (e.g. of various synonyms) are separated by ";"
                            pre = "" if feedbackString == "" else "; "
                            feedbackString = feedbackString + pre + tokenstring + text_feedback
                            # option == [] means that the main term is "andere" and the "synonyms" are simply a
                            # list of terms such that if found the feedback is triggered (but we do not want to
                            # call them "synonyms" of andere)
                            if option != [] and syn:
                                feedbackString = feedbackString + " und ist ein Synonym von " + ''.join(option)
                            # record given term and found term for feedback stats
                            if feedbackStats[statscount * 3 + 2] == "":
                                feedbackStats[statscount * 3 + 2] = targetTerm
                                feedbackStats[statscount * 3 + 3] = tokenstring
                            elif pre == "":
                                feedbackStats[statscount * 3 + 2] += "; " + targetTerm
                                feedbackStats[statscount * 3 + 3] += "; " + tokenstring
                            else:
                                feedbackStats[statscount * 3 + 2] += " & " + targetTerm
                                feedbackStats[statscount * 3 + 3] += " & " + tokenstring
                        feedback_f.write(feedbackString)
                feedback_f.write("\n\nFEEDBACK:\n")
                feedback_f.write(feedback + "\n\n")
                feedback_f.write("_____________________________________________\n")
                feedbackList.append(feedback_entry)
                feedbackCount += 1
            # if the feedback is not given since the conditions are not satisfied,
            # this is only recorded in feedback stats - no feedback is recorded in the
            # feedback file or given to CASUS
            else:
                feedbackStats.append("0")
                feedbackStats.append("")
                feedbackStats.append("")
                # some main terms are made of two different conditions, e.g. MRT & NOT Roentgen
                # (these are two options); a term made of various words is still one option,
                # e.g. Hepatitis Serologie
                for i, option in enumerate(terms_main):
                    # if this is the "andere" row (thus none of the specified terms was found),
                    # record "andere" as main term (and nothing in the found column)
                    if len(option) == 0:
                        feedbackStats[statscount * 3 + 2] = "andere"
                    else:
                        neg = True if "NOT" in option[0] else False
                        # if no terms for this option were found in the essay, i.e.
                        # a) term is negative and satisfied (so not found in essay)
                        # b) term is positive and not satisfied (so not found in essay)
                        # record the main term (and nothing in the found column)
                        # NOTE: a satisfied option is possible if there are various options and
                        # one of the others is not satisfied
                        if (fulfilled[i] == True and neg == True) or (fulfilled[i] == False and neg == False):
                            if feedbackStats[statscount * 3 + 2] == "":
                                feedbackStats[statscount * 3 + 2] = ''.join(option)
                            else:
                                feedbackStats[statscount * 3 + 2] += " & " + ''.join(option)
                        # if terms for this option were found in the essay,
                        # record both the given term and the found version of it
                        else:
                            l = useful_terms_main_advanced[i] if len(useful_terms_main_advanced[i]) != 0 \
                                else useful_terms_synonyms_advanced[i]
                            for x, (found, targetTerm) in enumerate(l):
                                foundlist = list(found)
                                tokenstring = ""
                                for index, token in foundlist:
                                    if tokenstring == "":
                                        tokenstring = token
                                    else:
                                        tokenstring = tokenstring + " " + token
                                if feedbackStats[statscount * 3 + 2] == "":
                                    feedbackStats[statscount * 3 + 2] = targetTerm
                                    feedbackStats[statscount * 3 + 3] = tokenstring
                                # if this is not the first found term and the list isn't empty
                                elif x > 0:
                                    feedbackStats[statscount * 3 + 2] += "; " + targetTerm
                                    feedbackStats[statscount * 3 + 3] += "; " + tokenstring
                                # if this is the first found term and the list isn't empty,
                                # i.e. another option has been recorded before
                                else:
                                    feedbackStats[statscount * 3 + 2] += " & " + targetTerm
                                    feedbackStats[statscount * 3 + 3] += " & " + tokenstring
            statscount += 1
        #print(feedbackStats)

    # write the feedback stats to the stats CSV file
    with open(RESULTS_PATH + "stats_case" + self.caseId + ".csv") as inf, open(
            RESULTS_PATH + "stats_caseNew" + self.caseId + ".csv", 'w') as outf:
        reader = csv.reader(inf, delimiter=",")
        writer = csv.writer(outf, delimiter=",")
        # check if the essay name already exists, i.e. if a user submitted a solution
        # for this case before; if so, the old stats are replaced with the new ones
        found = False
        for line in reader:
            if line[0] == self.essay_name:
                writer.writerow(feedbackStats)
                found = True
            else:
                writer.writerow(line)
        if not found:
            writer.writerow(feedbackStats)
    os.remove(RESULTS_PATH + "stats_case" + self.caseId + ".csv")
    os.rename(RESULTS_PATH + "stats_caseNew" + self.caseId + ".csv",
              RESULTS_PATH + "stats_case" + self.caseId + ".csv")
    return feedbackList
def fillout_frames(self, filename_list):
    # reads in all the xml files and fills the two dataframes with the corresponding values
    # also creates mappings from tokens and ners to ids

    # initiate lists
    data_list = [["sentence_id", "token_id", "char_start_id", "char_end_id", "split"]]
    ner_list = [["sentence_id", "ner_id", "char_start_id", "char_end_id"]]

    # initiate word and ner mapping dictionaries
    self.id2word = {}
    self.id2ner = {}
    self.id2ner[0] = 'None'
    punct = "-,.?!:;"
    ner_id = 1
    word_id = 1

    # start reading in the files
    for filename in filename_list:
        # get split from pathname and create validation set
        if 'Test' in str(filename):
            split = 'test'
        else:
            # split train into train and validation
            split = random.choices(["train", "val"], weights=(75, 25), k=1)[0]

        # access xml data
        tree = ET.parse(filename)
        root = tree.getroot()
        for elem in root:
            # get sent_id
            sent_id = elem.get("id")
            # get tokens from sentence
            sentence = elem.get("text")
            sentence = sentence.replace(";", " ")
            sentence = sentence.replace("/", " ")
            tokenizer = RegexpTokenizer(r"\s|:|;", gaps=True)
            tokenized = tokenizer.tokenize(sentence)
            tokenized = [word.strip(punct) if word[-1] in punct else word
                         for word in tokenized]
            span = list(tokenizer.span_tokenize(sentence))
            char_ids = []
            for tpl in span:
                char_ids.append((tpl[0], tpl[1] - 1))

            # creating data_df_list, one sentence at a time
            for i, token in enumerate(tokenized):
                if token not in self.id2word.values():
                    self.id2word[word_id] = token
                    word_id += 1
                token_id = self.get_id(token, self.id2word)
                # one row in data_df
                word_tpl = (sent_id, token_id, int(char_ids[i][0]), int(char_ids[i][1]), split)
                data_list.append(word_tpl)

            for subelem in elem:
                if subelem.tag == "entity":
                    # get ner
                    ner = subelem.get("type")
                    # update ner id dict
                    if ner not in self.id2ner.values():
                        self.id2ner[ner_id] = ner
                        ner_id += 1
                    label = self.get_id(ner, self.id2ner)
                    # get char_start_id and char_end_id
                    if ";" not in subelem.get("charOffset"):
                        char_start, char_end = subelem.get("charOffset").split("-")
                        char_start, char_end = int(char_start), int(char_end)
                        # add row in ner_df for current entity
                        ner_list.append([sent_id, label, char_start, char_end])
                    # if more than one mention of an entity, split into several lines
                    else:
                        occurences = subelem.get("charOffset").split(";")
                        for occurence in occurences:
                            char_start, char_end = occurence.split("-")
                            char_start, char_end = int(char_start), int(char_end)
                            # add row in ner_df for current entity
                            ner_list.append([sent_id, label, char_start, char_end])

    self.data_df = pd.DataFrame(data_list[1:], columns=data_list[0])
    self.ner_df = pd.DataFrame(ner_list[1:], columns=ner_list[0])
parse_dict("ORG", "PER", glob.glob("resources/Collection5/*.ann"), lambda s : tokenize1(s)) parse_dict("Org", "Person", glob.glob("resources/testset/*.objects"), lambda s : tokenize2(s)) context = et.iterparse("resources/dict.opcorpora.xml", tag='lemma') for (_, element) in context: tag = Tag.NONE lemma = element[0] for g in lemma: if g.attrib['v'] in tag_map: tag = tag_map[g.attrib['v']] if tag != Tag.NONE: for form in element[1:]: root.add([form.attrib['t']], tag) with open("result.txt", "w") as result_file: with open("resources/dataset_40163_1.txt", "r") as dataset: for sentence in dataset: sentence = preprocess(sentence) tokens = [lemmatize(token) for token in tokenizer.tokenize(sentence)] positions = list(tokenizer.span_tokenize(sentence)) current_index = 0 while current_index < len(tokens): (tag, size) = root.get_first_match(tokens[current_index:]) #if tag == Tag.NONE: # (tag, size) = predict(tokens[current_index:]) for index in range(current_index, current_index + size): result_file.write(f"{positions[index][0]} {positions[index][1] - positions[index][0]} {tag.name} ") if tag == Tag.NONE: size = 1 current_index += size result_file.write("EOL\n")
# do_not_remove = [':', '-', '\\', '/']
# remove_punctuation = [p for p in punctuation if p not in do_not_remove]
# print remove_punctuation

data_dir = "../data/raw_harvard_tlink"
labels = []
for file in os.listdir(data_dir):
    if not file.endswith('.txt'):
        continue
    startIndices, endIndices = getStartEndIndices(file)
    f = open(os.path.join(data_dir, file), 'r')
    raw = f.read()

    tokenizer = RegexpTokenizer(patterns)
    span_generator = tokenizer.span_tokenize(raw)
    spans = [span for span in span_generator]
    # tokenizer = RegexpTokenizer(patterns)
    words = tokenizer.tokenize(raw)

    chunkStart = False
    offset = 0
    phraseBeginsAt = 0
    for i in range(len(words)):
        startIndex = spans[i][0] + 1
        endIndex = spans[i][1] + 1
        if str(startIndex) in startIndices and str(endIndex) in endIndices:
            label = "B-TIMEX3"
            chunkStart = False
        elif str(startIndex) in startIndices:
            label = "B-TIMEX3"
            chunkStart = True
# only by using finditer was I able to obtain the results without greedy search;
# the function returns an iterable, so we loop over it
for x in re.finditer(p, line):
    # x.group(0) contains the regex-matched expression
    wordList.append(x.group(0))

posTagList = nltk.pos_tag(wordList)
annotateIndexes = []
with open(fname + '.ann', 'r', encoding='utf8') as fp:
    for line in fp:
        wordArr = line.split("\t")
        indices = wordArr[1].split(" ")
        annotateIndexes.append(indices)

wordSpanIndex = list(re_tokenizer.span_tokenize(s))
index = 0
if not annotate:
    annotateList = posTagList
while index < len(posTagList) and annotate:
    x = posTagList[index]
    annotated = False
    totalNewlines = s[0:wordSpanIndex[index][0]].count('\n\n')
    spanStart = wordSpanIndex[index][0]  # + totalNewlines
    for annotateObj in annotateIndexes:
        tag = annotateObj[0]