Example no. 1
 def _getMLU(self, fileid):
     sents = self._get_words(fileid, speaker='CHI', sent=True, stem=True,
                 relation=False, pos=True, strip_space=True, replace=True)
     results = []
     lastSent = []
     numFillers = 0
     for sent in sents:
         posList = [pos for (word, pos) in sent]
         # if any part of the sentence is unintelligible, skip it
         if any(pos == 'unk' for pos in posList):
             continue
         # if the sentence is null, skip it
         elif sent == []:
             continue
         # if the sentence repeats the previous one, skip it
         elif sent == lastSent:
             continue
         else:
             results.append([word for (word, pos) in sent])
             # count number of fillers (communicators and untagged tokens)
             numFillers += posList.count('co')
             numFillers += posList.count(None)
         lastSent = sent
     try:
         thisWordList = flatten(results)
         # count number of morphemes (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
         numWords = float(len(flatten([word.split('-') for word in thisWordList]))) - numFillers
         numSents = float(len(results))
         mlu = numWords / numSents
     except ZeroDivisionError:
         mlu = 0
     # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents}
     return mlu
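
Every example on this page calls a flatten helper that the snippets themselves never define. A minimal sketch of what these call sites appear to assume (variadic, recurses into nested lists/tuples/sets, treats strings and other atoms as leaves, returns a flat list; the real helper in each source project may differ):

def flatten(*args):
    """Recursively flatten any mix of atoms and nested containers into one list."""
    result = []
    for arg in args:
        if isinstance(arg, (list, tuple, set)):
            # recurse into containers item by item
            for item in arg:
                result.extend(flatten(item))
        else:
            # strings, numbers and other atoms are kept as-is
            result.append(arg)
    return result

Under this sketch, flatten(k, self.tags[:2]) in Example no. 2 yields [k, tag1, tag2], and flatten(results) above merges a list of word lists into one word list.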
Example no. 2
 def get_tag_sequence(self):
     self.tags = flatten([max([(x,y) for x,y in self.Table.items() if x[0] == self.n], key=lambda k: k[-1]["score"])[0][-2:]])
     # list() keeps the range reversible under Python 3; array is presumably numpy's
     i = list(range(1, self.n - 1))
     i.reverse()
     i = array(i) + 2
     for k in i:
         self.tags.insert(0, self.Table[tuple(flatten(k, self.tags[:2]))]["t"])
Example no. 3
 def get_tag_sequence(self):
     self.tags = flatten([
         max([(x, y) for x, y in self.Table.items() if x[0] == self.n],
             key=lambda k: k[-1]["score"])[0][-2:]
     ])
     # list() keeps the range reversible under Python 3; array is presumably numpy's
     i = list(range(1, self.n - 1))
     i.reverse()
     i = array(i) + 2
     for k in i:
         self.tags.insert(0, self.Table[tuple(flatten(k,
                                                      self.tags[:2]))]["t"])
Example no. 4
 def _getMLU(self, fileid, speaker):
     sents = self._get_words(
         fileid,
         speaker=speaker,
         sent=True,
         stem=True,
         relation=False,
         pos=True,
         strip_space=True,
         replace=True,
     )
     results = []
     lastSent = []
     numFillers = 0
     sentDiscount = 0
     for sent in sents:
         posList = [pos for (word, pos) in sent]
         # if any part of the sentence is unintelligible, skip it
         if any(pos == "unk" for pos in posList):
             continue
         # if the sentence is null
         elif sent == []:
             continue
         # if the sentence is the same as the last sent
         elif sent == lastSent:
             continue
         else:
             results.append([word for (word, pos) in sent])
             # count number of fillers
             if len(set(["co", None]).intersection(posList)) > 0:
                 numFillers += posList.count("co")
                 numFillers += posList.count(None)
                 sentDiscount += 1
         lastSent = sent
     try:
         thisWordList = flatten(results)
         # count number of morphemes
         # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
         numWords = (
             len(flatten([word.split("-") for word in thisWordList])) - numFillers
         )
         numSents = len(results) - sentDiscount
         mlu = numWords / numSents
     except ZeroDivisionError:
         mlu = 0
     # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents}
     return mlu
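
A hypothetical invocation through NLTK's CHILDESCorpusReader, whose public MLU() method wraps this helper; the corpus root below is an assumption about where a local copy of the CHILDES XML data lives:

from nltk.corpus.reader import CHILDESCorpusReader

corpus_root = 'corpora/childes/data-xml/Eng-USA/'  # hypothetical local path
valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')
# one mean-length-of-utterance value per file for the given speaker
print(valian.MLU(valian.fileids()[0], speaker='CHI'))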
Example no. 5
 def _getMLU(self, fileid, speaker):
     sents = self._get_words(
         fileid,
         speaker=speaker,
         sent=True,
         stem=True,
         relation=False,
         pos=True,
         strip_space=True,
         replace=True,
     )
     results = []
     lastSent = []
     numFillers = 0
     sentDiscount = 0
     for sent in sents:
         posList = [pos for (word, pos) in sent]
         # if any part of the sentence is unintelligible, skip it
         if any(pos == 'unk' for pos in posList):
             continue
         # if the sentence is null
         elif sent == []:
             continue
         # if the sentence is the same as the last sent
         elif sent == lastSent:
             continue
         else:
             results.append([word for (word, pos) in sent])
             # count number of fillers
             if len(set(['co', None]).intersection(posList)) > 0:
                 numFillers += posList.count('co')
                 numFillers += posList.count(None)
                 sentDiscount += 1
         lastSent = sent
     try:
         thisWordList = flatten(results)
         # count number of morphemes
         # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
         numWords = (
             len(flatten([word.split('-') for word in thisWordList])) - numFillers
         )
         numSents = len(results) - sentDiscount
         mlu = numWords / numSents
     except ZeroDivisionError:
         mlu = 0
     # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents}
     return mlu
Example no. 6
 def package(self):
     n = len(self.sent)
     # enumerate(self.tri()) yields (i, (y2, y1, y)); flatten turns each into [i, y2, y1, y]
     vals = [flatten(x) for x in enumerate(self.tri())]
     return [{
         "w": self.sent,
         "i": i,
         "y2": y2,
         "y1": y1,
         "y": y
     } for i, y2, y1, y in vals]
Example no. 7
 def add(self, key, value):
     if value is not None and len(value) > 0:
         # unwrap single-element lists before storing
         if not self.contains(key):
             if len(value) == 1:
                 value = value[0]
             self[key] = value
         else:
             if len(value) == 1:
                 value = value[0]
             # merge into the existing entry, skipping duplicates
             if value not in self[key]:
                 self[key] = flatten(self[key], value)
Example no. 8
 def _getMLU(self, fileid):
     sents = self._get_words(fileid,
                             speaker='CHI',
                             sent=True,
                             stem=True,
                             relation=False,
                             pos=True,
                             strip_space=True,
                             replace=True)
     results = []
     lastSent = []
     numFillers = 0
     for sent in sents:
         posList = [pos for (word, pos) in sent]
         # if any part of the sentence is unintelligible, skip it
         if any(pos == 'unk' for pos in posList):
             continue
         # if the sentence is null
         elif sent == []:
             continue
         # if the sentence is the same as the last sent
         elif sent == lastSent:
             continue
         else:
             results.append([word for (word, pos) in sent])
             # count number of fillers
             numFillers += posList.count('co')
             numFillers += posList.count(None)
         lastSent = sent
     try:
         thisWordList = flatten(results)
         # count number of morphemes (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
         numWords = float(
             len(flatten([word.split('-')
                          for word in thisWordList]))) - numFillers
         numSents = float(len(results))
         mlu = numWords / numSents
     except ZeroDivisionError:
         mlu = 0
     # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents}
     return mlu
Example no. 9
from nltk.tokenize import sent_tokenize, word_tokenize
from scipy.spatial.distance import cdist


def get_paragraph_distances(get_embeddings, title, paragraphs):
    """Calculates the sorted distances of a question title to a number of paragraphs using sentence embeddings.

    Uses the min distance over all sentences in a paragraph for sorting.
    """
    title_tok_str = ' '.join(word_tokenize(title))
    title_embeddings = get_embeddings([title_tok_str.lower()])
    paragraph_sents_lowered = [[
        ' '.join(word_tokenize(s)).lower() for s in sent_tokenize(p)
    ] for p in paragraphs]
    paragraphs_embeddings = get_embeddings(flatten(paragraph_sents_lowered))
    distances = cdist(paragraphs_embeddings, title_embeddings,
                      'cosine').reshape(-1)

    distances_per_paragraph = []
    sents_processed = 0
    for sents in paragraph_sents_lowered:
        distances_per_paragraph.append(
            min(distances[sents_processed:sents_processed + len(sents)]))
        sents_processed += len(sents)

    return distances_per_paragraph
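
A hypothetical smoke test: fake_embeddings is a crude stand-in, not a real sentence-embedding model, and NLTK's punkt tokenizer data must be installed (nltk.download('punkt')):

import numpy as np

def fake_embeddings(texts):
    # stand-in encoder: one 2-d vector per text, just to exercise the API
    return np.array([[len(t), t.count(' ') + 1.0] for t in texts])

dists = get_paragraph_distances(
    fake_embeddings,
    "How do I flatten a nested list?",
    ["Use a recursive helper. It handles arbitrary nesting.",
     "This paragraph is about something else entirely."])
print(dists)  # one min cosine distance per paragraph; smaller = closer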
Example no. 10
from collections import defaultdict


def frequencies(*seq):
    # flatten accepts the argument tuple itself, so callers may pass
    # any mix of atoms and nested sequences
    seq = flatten(seq)
    freq = defaultdict(int)
    for i in seq:
        freq[i] += 1
    return freq
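
A hypothetical call, relying on flatten being variadic as sketched earlier:

freq = frequencies(['a', 'b'], ['a', ['c', 'a']])
print(dict(freq))  # {'a': 3, 'b': 1, 'c': 1}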
Example no. 11
    def _compare_with_conn(self, current_token, dir_is_left,
                           connective_instances, instance_under_construction):
        if dir_is_left:
            arc_direction = 'LEFT'
            first_uncompared_index = -1
            compared = self.lambda_2
            uncompared = self.lambda_1
        else:
            arc_direction = 'RIGHT'
            first_uncompared_index = 0
            compared = self.lambda_3
            uncompared = self.lambda_4

        conn_instance_index = 0
        conn_instance = connective_instances[conn_instance_index]
        other_connective_tokens = set(
            flatten([i.connective for i in connective_instances[1:]]))
        other_connective_tokens -= set(conn_instance.connective)
        last_modified_arc_type = None

        while uncompared:
            token_to_compare = uncompared[first_uncompared_index]

            # First, see if we should split. But don't split on leftward tokens.
            if (not dir_is_left and token_to_compare in other_connective_tokens
                    and self._last_op != 'SPLIT'):
                instance_under_construction = self._do_split(
                    current_token, last_modified_arc_type, token_to_compare,
                    instance_under_construction)
                # Move to next
                conn_instance_index += 1
                conn_instance = connective_instances[conn_instance_index]
                # Leave current token to be compared with new connective.
            else:
                # If there's a fragment, record it first, before looking at the
                # args. (The fragment word might still be part of an arg.)
            if (token_to_compare is not current_token
                    # no fragments after splits/frags
                    and self._last_op not in ['SPLIT',
                                              "CONN-FRAG-{}".format(arc_direction)]
                    and token_to_compare in conn_instance.connective):
                    self._write_transition(
                        current_token, "CONN-FRAG-{}".format(arc_direction))
                    instance_under_construction.connective.append(
                        token_to_compare)

                arcs_to_add = []
                for arc_type in ['cause', 'effect', 'means']:
                    argument = getattr(conn_instance, arc_type, None)
                    if argument is not None and token_to_compare in argument:
                        arcs_to_add.append(arc_type)
                        # TODO: This will do odd things if there's ever a SPLIT
                        # interacting with a multiple-argument arc.
                        last_modified_arc_type = arc_type
                if arcs_to_add:
                    trans = "{}-ARC({})".format(
                        arc_direction,
                        ','.join(arc_type.title() for arc_type in arcs_to_add))
                    instance_under_construction = self._write_transition(
                        current_token, trans, True,
                        instance_under_construction)
                    for arc_type in arcs_to_add:
                        getattr(instance_under_construction,
                                arc_type).append(token_to_compare)
                else:
                    instance_under_construction = self._write_transition(
                        current_token, "NO-ARC-{}".format(arc_direction), True,
                        instance_under_construction)

                if dir_is_left:
                    compared.appendleft(uncompared.pop())
                else:
                    compared.append(uncompared.popleft())

        return instance_under_construction  # make update visible
Example no. 12
 def __init__(self, docs):
     self.tuples = flatten([Process(x).hist for x in docs])
Example no. 13
 def add(self, software_name, tweet):
     if self.contains(software_name):
         self[software_name]["tweets"] = flatten(self[software_name]["tweets"], tweet)
         self[software_name]["weight"] += 1
     else:
         self[software_name] = {"tweets": tweet, "weight": 1}
Example no. 14
 def package(self):
     n = len(self.sent)
     # enumerate(self.tri()) yields (i, (y2, y1, y)); flatten gives flat [i, y2, y1, y]
     vals = [flatten(x) for x in enumerate(self.tri())]
     return [{"w": self.sent, "i": i, "y2": y2, "y1": y1, "y": y} for i, y2, y1, y in vals]
Example no. 16
 def active_features(h, m, l=None):
     if l:
         active = flatten([f(h, m, l) for f in features.values()])
     else:
         active = flatten([f(h, m) for f in features.values()])
     return [x for x in active if x in model]  # dict.has_key() is Python 2 only
Example no. 17
 def score(h, m, l=None):
     if l:
         active = flatten([f(h, m, l) for f in features.values()])
     else:
         active = flatten([f(h, m) for f in features.values()])
     return sum(model[x] for x in active if x in model)  # dict.has_key() is Python 2 only
Example no. 18
 def score(h, m, l=None):
     if l:
         active = flatten([f(h, m, l) for f in features.values()])
     else:
         active = flatten([f(h, m) for f in features.values()])
     return sum(model[x] for x in active if x in model)
Example no. 19
 def active_features(h, m, l=None):
     if l:
         active = flatten([f(h, m, l) for f in features.values()])
     else:
         active = flatten([f(h, m) for f in features.values()])
     return [x for x in active if x in model]
Example no. 20
Texts.
The first classifier learns to sort texts into different categories.
The second learns which categories the user finds interesting."""

# 1500 random text passages from the Reuters corpus
print('Load corpus')
#corp = reuters.raw()
print('Loaded corpus')
#rnd = np.random.randint(0,len(corp)/2,1500)
#raw_documents = [corp[i:i+300] for i in rnd]
print('Created docs')

pdb.set_trace()
corp = brown.paras(categories='hobbies')
rnd = np.random.randint(0, len(corp) - 3, 300)
raw_documents = [flatten(corp[i:i + 3]) for i in rnd]
pdb.set_trace()
raw_doc2 = list()
for doc in raw_documents:
    # join each document's tokens into one space-separated string
    raw_doc2.append(' '.join(str(word) for word in doc))
raw_documents = raw_doc2

pdb.set_trace()
#posts_j = json.load(open('cogsci.json'))
#posts = posts_j.values()
#raw_documents = list()
#for post in posts:
#    if 'message' in post:
#        raw_documents.append(post['message'])
#
max_docs = len(raw_documents)
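
The excerpt stops after building raw_documents. A minimal sketch of the first classifier the docstring describes (text to category), assuming scikit-learn is available; the labels are placeholders, since the excerpt samples only the 'hobbies' category and shows no real label set:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

labels = ['hobbies'] * max_docs  # placeholder labels, not from the original script
text_clf = make_pipeline(TfidfVectorizer(), MultinomialNB())
text_clf.fit(raw_documents, labels)
print(text_clf.predict([raw_documents[0]]))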