Code example #1
File: sbd.py  Project: anurupborah2001/lancet
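All five snippets below are excerpts from splitta's sbd.py and omit the module-level imports they rely on. A minimal set covering the code shown (sbd_util and word_tokenize ship with splitta; they are not standard library):

import sys
import re
import collections
import sbd_util        ## splitta helpers: Counter, normalize, ...
import word_tokenize   ## splitta's tokenizer module
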
    def train(self, doc):

        sys.stderr.write('training nb... ')
        feats = collections.defaultdict(sbd_util.Counter)
        totals = sbd_util.Counter()

        ## walk the linked list of fragments; doc.frag is the head
        frag = doc.frag
        while frag:
            for feat, val in frag.features.items():
                feats[frag.label][feat + '_' + val] += 1
            totals[frag.label] += len(frag.features)
            frag = frag.next

        ## additive (add-0.1) smoothing and normalization
        sys.stderr.write('smoothing... ')
        smooth_inc = 0.1
        all_feat_names = set(feats[True].keys()) | set(feats[False].keys())
        for label in [0, 1]:  ## True/False hash as 1/0, so these index the counts above
            totals[label] += (len(all_feat_names) * smooth_inc)
            for feat in all_feat_names:
                feats[label][feat] += smooth_inc
                feats[label][feat] /= totals[label]
                self.feats[(label, feat)] = feats[label][feat]
            feats[label]['<prior>'] = totals[label] / totals.totalCount()
            self.feats[(label, '<prior>')] = feats[label]['<prior>']

        sys.stderr.write('done!\n')
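
The smoothing loop above is additive smoothing: every feature count is bumped by a small constant (smooth_inc = 0.1) before dividing by the equally inflated total, so a feature never seen with one label still gets a nonzero probability. A standalone sketch of the same arithmetic with made-up counts (the names k and counts are illustrative, not from sbd.py):

## P(feat | label) = (count + k) / (total + k * |V|)
k = 0.1
counts = {'word1_the': 7, 'word1_Mr.': 0}   ## hypothetical feature counts
total = sum(counts.values()) + k * len(counts)
probs = {feat: (c + k) / total for feat, c in counts.items()}
print(probs)   ## the unseen feature still gets probability 0.1 / 7.2 > 0
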
Code example #2
File: sbd.py  Project: wpli/splitta
    def get_stats(self, verbose):
        if verbose: sys.stderr.write('getting statistics... ')
        lower_words = sbd_util.Counter()
        non_abbrs = sbd_util.Counter()
        
        frag = self.frag
        while frag:
            for word in frag.tokenized.split():
                if word.replace('.', '').isalpha():
                    if word.islower(): lower_words[word.replace('.','')] += 1
                    if not word.endswith('.'): non_abbrs[word] += 1
            frag = frag.next

        if verbose: sys.stderr.write('lowercased [%d] non-abbrs [%d]\n'
                                     %(len(lower_words), len(non_abbrs)))

        return lower_words, non_abbrs
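
To see what get_stats collects, the same filtering can be applied to a toy string, with collections.Counter standing in for sbd_util.Counter (assumed to behave like a counting dict):

import collections

lower_words, non_abbrs = collections.Counter(), collections.Counter()
for word in 'Mr. Smith arrived. He said the dr. would follow.'.split():
    if word.replace('.', '').isalpha():
        if word.islower(): lower_words[word.replace('.', '')] += 1
        if not word.endswith('.'): non_abbrs[word] += 1
print(lower_words)   ## words seen lowercased, periods stripped: said, the, dr, ...
print(non_abbrs)     ## words seen without a trailing period: Smith, He, ...

A likely abbreviation such as 'Mr.' lands in neither counter: it is capitalized and always carries a trailing period, a contrast the downstream features can exploit.
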
Code example #3
File: sbd.py  Project: wpli/splitta
    def classify_nb_one(self, frag):
        ## raising the prior to the 4th power is ad hoc, but consistently works better
        probs = sbd_util.Counter([(label, self.feats[label, '<prior>']**4) for label in [0,1]])
        for label in probs:
            for feat, val in frag.features.items():
                key = (label, feat + '_' + val)
                if key not in self.feats: continue
                probs[label] *= self.feats[key]

        probs = sbd_util.normalize(probs)
        return probs[1]
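
classify_nb_one multiplies many per-feature probabilities together, so the two raw scores must be rescaled at the end. sbd_util.normalize is not shown in these excerpts; presumably it divides each value by the total so the scores sum to 1, as in this sketch over a plain dict:

def normalize(probs):
    ## assumed behavior of sbd_util.normalize: rescale values to sum to 1
    total = sum(probs.values())
    return {label: p / total for label, p in probs.items()} if total else probs

print(normalize({0: 0.002, 1: 0.006}))   ## {0: 0.25, 1: 0.75}

Note that a long product of probabilities below 1 can underflow to 0.0; Naive Bayes implementations commonly sum log-probabilities instead for that reason.
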
Code example #4
File: sbd.py  Project: anurupborah2001/lancet
def get_text_data(text, expect_labels=True, tokenize=False, verbose=False):
    """
    get text, returning an instance of the Doc class
    doc.frag is the first frag, and each points to the next
    """

    frag_list = None
    word_index = 0
    frag_index = 0
    curr_words = []
    lower_words, non_abbrs = sbd_util.Counter(), sbd_util.Counter()

    for line in text.splitlines():

        ## deal with blank lines
        if (not line.strip()) and frag_list:
            if not curr_words: frag.ends_seg = True
            else:
                frag = Frag(' '.join(curr_words))
                frag.ends_seg = True
                if expect_labels: frag.label = True
                prev.next = frag
                if tokenize:
                    tokens = word_tokenize.tokenize(frag.orig)
                else:
                    tokens = frag.orig
                frag.tokenized = tokens
                frag_index += 1
                prev = frag
                curr_words = []

        for word in line.split():
            curr_words.append(word)

            if is_sbd_hyp(word):
                frag = Frag(' '.join(curr_words))
                if not frag_list: frag_list = frag
                else: prev.next = frag

                ## get label; tokenize
                if expect_labels: frag.label = int('<S>' in word)
                if tokenize:
                    tokens = word_tokenize.tokenize(frag.orig)
                else:
                    tokens = frag.orig
                tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
                frag.tokenized = tokens

                frag_index += 1
                prev = frag
                curr_words = []

            word_index += 1

    ## last frag
    frag = Frag(' '.join(curr_words))
    if not frag_list: frag_list = frag
    else: prev.next = frag
    if expect_labels: frag.label = int('<S>' in word)
    if tokenize:
        tokens = word_tokenize.tokenize(frag.orig)
    else:
        tokens = frag.orig
    tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
    frag.tokenized = tokens
    frag.ends_seg = True
    frag_index += 1

    if verbose:
        sys.stderr.write(' words [%d] sbd hyps [%d]\n' %
                         (word_index, frag_index))

    ## create a Doc object to hold all this information
    doc = Doc(frag_list)
    return doc
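
get_text_data leans on three names defined elsewhere in sbd.py: Frag and Doc (a fragment in the linked list and its container) and is_sbd_hyp, which flags tokens that might end a sentence. The real predicate is not among these excerpts; a rough guess at its behavior, for orientation only:

import re

def is_sbd_hyp(word):
    ## assumption: a token is a sentence-boundary hypothesis if, after
    ## removing the <A>/<E>/<S> training annotations, it ends in ., !, or ?
    ## (optionally followed by closing quotes or brackets)
    stripped = re.sub('(<A>)|(<E>)|(<S>)', '', word)
    return bool(re.search(r'[.!?]["\')\]]*$', stripped))
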
Code example #5
File: sbd.py  Project: MirRaonaq/SBD
def get_data(files,
             expect_labels=True,
             tokenize=False,
             verbose=False,
             files_already_opened=False):
    """
    load text from files, returning an instance of the Doc class
    doc.frag is the first frag, and each points to the next
    """

    if isinstance(files, str): files = [files]
    frag_list = None
    word_index = 0
    frag_index = 0
    curr_words = []
    lower_words, non_abbrs = sbd_util.Counter(), sbd_util.Counter()

    for file in files:
        sys.stderr.write('reading [%s]\n' % file)

        if files_already_opened:
            fh = file
        else:
            fh = open(file)

        for line in fh:

            ## deal with blank lines
            if (not line.strip()) and frag_list:
                if not curr_words: frag.ends_seg = True
                else:
                    frag = Frag(' '.join(curr_words))
                    frag.ends_seg = True
                    if expect_labels: frag.label = True
                    prev.next = frag
                    if tokenize:
                        tokens = word_tokenize.tokenize(frag.orig)
                    else:
                        tokens = frag.orig
                    frag.tokenized = tokens
                    frag_index += 1
                    prev = frag
                    curr_words = []

            for word in line.split():
                curr_words.append(word)

                if is_sbd_hyp(word):
                    frag = Frag(' '.join(curr_words))
                    if not frag_list: frag_list = frag
                    else: prev.next = frag

                    ## get label; tokenize
                    if expect_labels: frag.label = int('<S>' in word)
                    if tokenize:
                        tokens = word_tokenize.tokenize(frag.orig)
                    else:
                        tokens = frag.orig
                    tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
                    frag.tokenized = tokens

                    frag_index += 1
                    prev = frag
                    curr_words = []

                word_index += 1

        if not files_already_opened:
            fh.close()

        ## last frag
        frag = Frag(' '.join(curr_words))
        if not frag_list: frag_list = frag
        else: prev.next = frag
        if expect_labels: frag.label = int('<S>' in word)
        if tokenize:
            tokens = word_tokenize.tokenize(frag.orig)
        else:
            tokens = frag.orig
        tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
        frag.tokenized = tokens
        frag.ends_seg = True
        frag_index += 1

    if verbose:
        sys.stderr.write(' words [%d] sbd hyps [%d]\n' %
                         (word_index, frag_index))

    ## create a Doc object to hold all this information
    doc = Doc(frag_list)
    return doc
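
A usage sketch for the files_already_opened flag (the path corpus.txt is illustrative). Note that already-opened handles should be passed inside a list: a bare handle would be iterated line by line by the outer for-file loop.

## let get_data open and close the file itself:
doc = get_data('corpus.txt', expect_labels=False, tokenize=True)

## or manage the handle yourself, e.g. when reading from a pipe or wrapper:
with open('corpus.txt') as fh:
    doc = get_data([fh], expect_labels=False, tokenize=True,
                   files_already_opened=True)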