コード例 #1
0
ファイル: preprocess.py プロジェクト: BMKEG/exp-parser
import os
from nltk.parse import stanford
import sys
import codecs
from nltk.tokenize import word_tokenize, sent_tokenize
import re

from path_reader import PathReader

# Build the project's PathReader from the local "./PATHS" file.
# NOTE(review): presumably that file maps names such as 'PARSER' to
# installation directories -- confirm against path_reader.py.
pathreader = PathReader("./PATHS")

# Both environment variables are set to the same configured 'PARSER' path.
# NOTE(review): NLTK's Stanford parser wrapper is expected to read these to
# locate the parser and model jars -- confirm for the installed NLTK version.
os.environ['STANFORD_PARSER'] = pathreader.get_path('PARSER')
os.environ['STANFORD_MODELS'] = pathreader.get_path('PARSER')
# Module-level parser instance, created once at import time with defaults.
parser = stanford.StanfordParser()


def get_longest_cand(cands):
    """Return the longest candidate in ``cands``.

    On ties the earliest candidate wins. If ``cands`` is empty, or every
    candidate has length zero, the empty string ``''`` is returned.
    """
    longest = ''
    for candidate in cands:
        # Strict comparison: a later candidate of equal length never
        # replaces the one already held.
        if len(candidate) > len(longest):
            longest = candidate
    return longest


def extract_sat_clause(tree, is_root=True):
    if len(tree) == 0 or type(tree) == unicode:
        return ""
    elif tree.label() == 'S' or tree.label() == 'SBAR':
        if is_root:
コード例 #2
0
ファイル: features.py プロジェクト: BMKEG/exp-parser
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import word_tokenize
import re
import os

from path_reader import PathReader

# Build the project's PathReader from the local "./PATHS" file.
# NOTE(review): presumably that file maps names such as 'TAGGER' to
# installation directories -- confirm against path_reader.py.
pathreader = PathReader("./PATHS")

# CLASSPATH is set to the configured 'TAGGER' path and STANFORD_MODELS to its
# "models" subdirectory.
# NOTE(review): NLTK's StanfordPOSTagger is expected to use these to find the
# tagger jar and the model file -- confirm for the installed NLTK version.
os.environ['CLASSPATH'] = pathreader.get_path('TAGGER')
os.environ['STANFORD_MODELS'] = pathreader.get_path('TAGGER') + "/models"


class FeatureProcessing(object):
    def __init__(self):
        """Initialise the feature index, the word lists and the POS tagger.

        The tagger is created from the model name
        'english-bidirectional-distsim.tagger'; ``get_features`` uses it to
        tag the tokens of a phrase.
        """
        tagger_model = 'english-bidirectional-distsim.tagger'

        # Starts empty; nothing in this constructor populates it.
        self.feat_index = {}
        # NOTE(review): judging by the attribute names these are cue words
        # for implication / hypothesis / method statements -- confirm
        # against their use in get_features.
        self.implication_words = ["demonstrate", "suggest", "indicate"]
        self.hyp_words = ["possible"]
        self.method_words = ["probe", "detect"]
        self.pos_tagger = StanfordPOSTagger(tagger_model)

    def get_features(self, phrase, filter_feature='0'):
        words = word_tokenize(phrase)
        pos_tags = self.pos_tagger.tag(words)
        features = []
        for word, tag in pos_tags:
            wl = word.lower()
            # Feat 1: POS features
            if filter_feature != '1':
                if tag != ',' and tag != '.':
コード例 #3
0
ファイル: features.py プロジェクト: BMKEG/exp-parser
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import word_tokenize
import re
import os

from path_reader import PathReader

# Build the project's PathReader from the local "./PATHS" file.
# NOTE(review): presumably that file maps names such as 'TAGGER' to
# installation directories -- confirm against path_reader.py.
pathreader = PathReader("./PATHS")

# CLASSPATH is set to the configured 'TAGGER' path and STANFORD_MODELS to its
# "models" subdirectory.
# NOTE(review): NLTK's StanfordPOSTagger is expected to use these to find the
# tagger jar and the model file -- confirm for the installed NLTK version.
os.environ['CLASSPATH'] = pathreader.get_path('TAGGER')
os.environ['STANFORD_MODELS'] = pathreader.get_path('TAGGER') + "/models"

class FeatureProcessing(object):
  def __init__(self):
    """Create an empty feature index, the word lists and the POS tagger.

    The tagger is built from the 'english-bidirectional-distsim.tagger'
    model name and is what ``get_features`` calls to tag a phrase's tokens.
    """
    model_name = 'english-bidirectional-distsim.tagger'

    # Empty on construction; this method does not fill it.
    self.feat_index = {}
    # NOTE(review): the attribute names suggest cue words for implication,
    # hypothesis and method statements -- confirm against get_features.
    self.implication_words = ["demonstrate", "suggest", "indicate"]
    self.hyp_words = ["possible"]
    self.method_words = ["probe", "detect"]
    self.pos_tagger = StanfordPOSTagger(model_name)

  def get_features(self, phrase, filter_feature='0'):
    words = word_tokenize(phrase)
    pos_tags = self.pos_tagger.tag(words)
    features = []
    for word, tag in pos_tags:
      wl = word.lower()
      # Feat 1: POS features
      if filter_feature != '1':
        if tag != ',' and tag != '.':
          features.append(tag)
      # Feat 2: Verb and adverb identity
コード例 #4
0
ファイル: preprocess.py プロジェクト: BMKEG/exp-parser
import os
from nltk.parse import stanford
import sys
import codecs
from nltk.tokenize import word_tokenize, sent_tokenize
import re

from path_reader import PathReader

# Build the project's PathReader from the local "./PATHS" file.
# NOTE(review): presumably that file maps names such as 'PARSER' to
# installation directories -- confirm against path_reader.py.
pathreader = PathReader("./PATHS")

# Both environment variables are set to the same configured 'PARSER' path.
# NOTE(review): NLTK's Stanford parser wrapper is expected to read these to
# locate the parser and model jars -- confirm for the installed NLTK version.
os.environ['STANFORD_PARSER'] = pathreader.get_path('PARSER')
os.environ['STANFORD_MODELS'] = pathreader.get_path('PARSER')
# Module-level parser instance, created once at import time with defaults.
parser = stanford.StanfordParser()

def get_longest_cand(cands):
  """Pick the longest element of ``cands``.

  The first element wins when several share the maximum length. The empty
  string ``''`` is returned when ``cands`` is empty or when no element has
  a length greater than zero.
  """
  winner = ''
  winner_len = 0
  for item in cands:
    item_len = len(item)
    # Skip anything that is not strictly longer than the current winner.
    if item_len <= winner_len:
      continue
    winner, winner_len = item, item_len
  return winner

def extract_sat_clause(tree, is_root = True):
  if len(tree) == 0 or type(tree) == unicode:
    return ""
  elif tree.label() == 'S' or tree.label() == 'SBAR':
    if is_root:
      return get_longest_cand([extract_sat_clause(t, is_root = False) for t in tree])
    else: