Beispiel #1
0
    def __init__(self, dir, corpus_name):
        """Open the corpus output files and prepare tokenizer resources.

        Three files are opened in append mode, located via
        ``joinp(dir, corpus_name + suffix)`` for the suffixes ``.tok``,
        ``.raw_token`` and ``.sentences``. Each file is wrapped in a
        tab-delimited ``csv.writer`` that performs no quoting and terminates
        rows with a single newline character.

        Args:
            dir: Directory component passed to ``joinp`` for every output file.
            corpus_name: Base name shared by the three output files.

        Raises:
            Any exception raised by ``open``, ``re.compile`` or ``load``.
            Files already opened by this call are closed before the
            exception propagates.
        """
        self.dir = dir
        self.corpus_name = corpus_name
        self.unique_token = defaultdict(int)

        # quotechar must be None, not '': Python 3.11+ rejects an empty
        # quotechar with a TypeError, while None is accepted together with
        # QUOTE_NONE on every version.
        writer_options = dict(delimiter='\t', lineterminator='\n', quotechar=None,
                              quoting=csv.QUOTE_NONE)

        # Pre-bind the handles so the cleanup below can tell which files
        # were actually opened when a later step fails.
        self.f_token = None
        self.f_raw_token = None
        self.f_sent = None
        try:
            self.f_token = open(joinp(self.dir, self.corpus_name + ".tok"), 'a')
            self.token_writer = csv.writer(self.f_token, **writer_options)

            self.f_raw_token = open(joinp(self.dir, self.corpus_name + ".raw_token"), 'a')
            self.raw_token_writer = csv.writer(self.f_raw_token, **writer_options)

            self.f_sent = open(joinp(self.dir, self.corpus_name + ".sentences"), 'a')
            self.sent_writer = csv.writer(self.f_sent, **writer_options)

            self.emoticon_re = re.compile(emoticon_string, re.VERBOSE | re.I | re.UNICODE)
            # TODO: use path from config or cmdline
            # NOTE(review): presumably load() reads the abbreviation list
            # from this relative path -- confirm against its definition.
            self.standard_abbreviations = load('preprocessor/config/data/abbrev')
        except Exception:
            # Do not leak file handles when construction fails part-way.
            for handle in (self.f_token, self.f_raw_token, self.f_sent):
                if handle is not None:
                    handle.close()
            raise
Beispiel #2
0
 def __init__(self, list_of_stopwords):
     """Store the result of ``load(list_of_stopwords)`` as the stopwords.

     Args:
         list_of_stopwords: Passed unchanged to ``load``; presumably a path
             or resource name of a stopword list -- TODO confirm against
             ``load``.
     """
     stopwords = load(list_of_stopwords)
     self._stopwords = stopwords