Example no. 1
 def __init__(self):
     """Set up an empty coder: fresh output buffer, comma stacks, context flags."""
     self.dictionary = Dictionaries()
     self.code = ""
     # one separator ("comma") stack per nesting construct
     for stack_name in ('blockCommaStack', 'elementCommaStack',
                        'attribCommaStack', 'alterCommaStack',
                        'condCommaStack'):
         setattr(self, stack_name, [])
     self.indents = 0
     # grammar-context flags, all off until the corresponding rule is entered
     self.smoothContext = False
     self.alternativesContext = False
     self.conditionContext = False
     self.conditionalContext = False
 def __init__(self, pos, chunk, event, classify, mallet_memory='256m'):
     """Initialise taggers, the NER model and the dictionary lookup tables.

     Args:
         pos, chunk, event, classify: booleans enabling the respective
             taggers/classifier (chunk and event also require pos).
         mallet_memory: JVM heap size string passed to the Mallet-based NER.
     """
     self.clear_line_counter()

     self.posTagger = pos_tagger_stdin.PosTagger() if pos else None
     self.chunkTagger = chunk_tagger_stdin.ChunkTagger() if chunk and pos else None
     self.eventTagger = event_tagger_stdin.EventTagger() if event and pos else None
     self.llda = GetLLda() if classify else None

     # pick the NER model matching the available upstream taggers
     if pos and chunk:
         self.ner_model = 'ner.model'
     elif pos:
         self.ner_model = 'ner_nochunk.model'
     else:
         self.ner_model = 'ner_nopos_nochunk.model'

     self.ner = GetNer(self.ner_model, memory=mallet_memory)
     self.fe = Features.FeatureExtractor('%s/data/dictionaries' % (BASE_DIR))
     self.capClassifier = cap_classifier.CapClassifier()
     self.vocab = Vocab('%s/hbc/data/vocab' % (BASE_DIR))

     # 1-based index -> dictionary name.
     # (Removed the redundant `self.dictMap = self.dictMap` self-assignment;
     # files are now closed via `with`.)
     self.dictMap = {}
     with open('%s/hbc/data/dictionaries' % (BASE_DIR)) as f:
         for i, line in enumerate(f, 1):
             self.dictMap[i] = line.rstrip('\n')

     # inverse mapping: dictionary name -> index
     self.dict2index = {name: i for i, name in self.dictMap.items()}

     if self.llda:
         self.dictionaries = Dictionaries('%s/data/LabeledLDA_dictionaries3' % (BASE_DIR), self.dict2index)

     # entity string -> 0-based id (only populated when LLDA is enabled)
     self.entityMap = {}
     if self.llda:
         with open('%s/hbc/data/entities' % (BASE_DIR)) as f:
             for i, line in enumerate(f):
                 self.entityMap[line.rstrip('\n')] = i

     # dictionary name -> label
     self.dict2label = {}
     with open('%s/hbc/data/dict-label3' % (BASE_DIR)) as f:
         for line in f:
             (dictionary, label) = line.rstrip('\n').split(' ')
             self.dict2label[dictionary] = label
Example no. 3
    def __init__(self):
        """Initialise the extractor with default models (no POS/chunk/event taggers)."""
        self.numberLines = 0
        self.eventTagger = None
        self.posTagger = None
        self.chunkTagger = None
        self.llda = GetLLda()
        self.ner = GetNer('ner_nopos_nochunk.model')
        self.fe = Features.FeatureExtractor('%s/data/dictionaries' %
                                            (BASE_DIR))
        self.capClassifier = cap_classifier.CapClassifier()

        self.vocab = Vocab('%s/hbc/data/vocab' % (BASE_DIR))

        # 1-based index -> dictionary name (files closed via `with`;
        # the original left every handle open)
        self.dictMap = {}
        with open('%s/hbc/data/dictionaries' % (BASE_DIR)) as f:
            for i, line in enumerate(f, 1):
                self.dictMap[i] = line.rstrip('\n')

        # inverse mapping: dictionary name -> index
        dict2index = {name: i for i, name in self.dictMap.items()}

        if self.llda:
            self.dictionaries = Dictionaries(
                '%s/data/LabeledLDA_dictionaries3' % (BASE_DIR), dict2index)

        # entity string -> 0-based id (only populated when LLDA is enabled)
        self.entityMap = {}
        if self.llda:
            with open('%s/hbc/data/entities' % (BASE_DIR)) as f:
                for i, line in enumerate(f):
                    self.entityMap[line.rstrip('\n')] = i

        # dictionary name -> label
        self.dict2label = {}
        with open('%s/hbc/data/dict-label3' % (BASE_DIR)) as f:
            for line in f:
                (dictionary, label) = line.rstrip('\n').split(' ')
                self.dict2label[dictionary] = label
Example no. 4
class PythonCoder():
    """Emit Python style-definition code while walking a style-grammar parse tree.

    The enter*/exit* methods are parse-tree listener callbacks.  Several
    "comma stacks" hold the separator that must be written *before* the next
    item at each nesting level, and the *Context flags record which grammar
    construct is currently being translated.  Output accumulates in
    ``self.code`` and is retrieved with :meth:`getCode`.
    """

    def __init__(self):
        self.dictionary = Dictionaries()
        self.code = ""  # generated source accumulates here
        self.blockCommaStack = []
        self.elementCommaStack = []
        self.attribCommaStack = []
        self.alterCommaStack = []
        self.condCommaStack = []
        self.indents = 0
        self.smoothContext = False
        self.alternativesContext = False
        self.conditionContext = False
        self.conditionalContext = False
        self.spec_condition = []

    def getCode(self):
        """Return the generated code accumulated so far."""
        return self.code

    def indent(self):
        """Return the whitespace prefix for the current nesting depth (4 spaces per level)."""
        return (" " * 4 * self.indents)

    def write(self, text):
        """Append *text* to the generated code buffer."""
        self.code += text

    def literalize(self, text, useUnderscore):
        """Wrap bare identifiers in *text* in double quotes.

        Text that already starts with a quote is returned unchanged.  With
        ``useUnderscore=True`` only letter runs are quoted and quoting
        artifacts around underscores/spaces are cleaned up afterwards.
        """
        # startswith() also handles the empty string safely (text[0] would raise)
        if text.startswith('"'):
            return text
        if useUnderscore:
            literalized = re.sub(
                '("[ _]+")', ' ',
                re.sub('("")', '"', re.sub('([a-zA-Z]+)', '"\\1"', text)))
        else:
            literalized = re.sub('([a-zA-Z_]+[a-zA-Z0-9_]*)', '"\\1"', text)
        return literalized

    def toCamelCase(self, text):
        """Convert snake_case *text* to CamelCase."""
        return ''.join([x.capitalize() for x in text.split('_')])

    def replaceHexColorCode(self, match):
        """Translate a '#rgb'/'#rrggbb' hex colour into an '(r, g, b, 1.0)' tuple string.

        Raises:
            Exception: if the matched digit group is neither 3 nor 6 long.
        """
        value = match.group(1)
        if len(value) == 3:  # short form: each digit doubles (#fff == #ffffff)
            value = [str(round(int(c + c, 16) / 255.0, 3)) for c in value]
        elif len(value) == 6:
            value = [
                str(round(int(c1 + c2, 16) / 255.0, 3))
                for c1, c2 in zip(value[::2], value[1::2])
            ]
        else:
            raise Exception('Invalid hex number: #' + value)
        return '({}, 1.0)'.format(', '.join(value))

    def replaceRGBColorCode(self, match):
        """Translate 'rgb(r,g,b)' (0-255 components) into a normalized RGBA tuple string."""
        rgb = match.group(0)
        values = rgb[4:-1].split(',')
        values.append('255')  # opaque alpha
        # BUGFIX: split() yields strings; convert before dividing
        # (the original `c / 255.` raised TypeError)
        return str(tuple(round(int(c) / 255., 3) for c in values))

    def replaceRGBAColorCode(self, match):
        """Translate 'rgba(r,g,b,a)' (alpha in 0..1) into a normalized RGBA tuple string."""
        rgba = match.group(0)
        values = rgba[5:-1].split(',')
        values[3] = str(255. * float(values[3]))  # scale alpha into the 0-255 range first
        # BUGFIX: convert the string components before dividing
        return str(tuple(round(float(c) / 255., 3) for c in values))

    def replaceColorsInText(self, text):
        """Replace hex/rgb/rgba/named colours in *text* with normalized tuples."""
        # BUGFIX: restrict the hex pattern to exactly 3 or 6 digits; the previous
        # pattern '#([0-9a-fA-F]+|[0-9a-fA-F]+)' duplicated one alternative,
        # matched any digit count, and made replaceHexColorCode raise.
        _hex_colour = re.compile(r'#([0-9a-fA-F]{3}|[0-9a-fA-F]{6})\b')
        text = _hex_colour.sub(self.replaceHexColorCode, text)

        _rgb_colour = re.compile(r'rgb\(\s*(?:(\d{1,3})\s*,?){3}\)')
        text = _rgb_colour.sub(self.replaceRGBColorCode, text)

        _rgba_colour = re.compile(r'rgba\(\s*(?:(\d{1,3})\s*,?){3},\d+\.\d+\)')
        text = _rgba_colour.sub(self.replaceRGBAColorCode, text)

        # finally substitute named colours from the project dictionary
        for word, initial in self.dictionary.colors.items():
            text = re.sub(r'\b' + word + r'\b', str(initial), text)
        return text

    # ------------------------------------------------------------
    # styles
    # : named_block+  EOF            #NAMED
    # | elements EOF                 #UNNAMED
    # ;
    # ------------------------------------------------------------

    def enterNAMED(self):
        """Open a named style dictionary."""
        self.indents = 1
        self.blockCommaStack.append('')
        self.write('styles = {\n')

    def exitNAMED(self):
        self.blockCommaStack.pop()
        self.write('\n}')

    def enterUNNAMED(self):
        """Open an anonymous style list."""
        self.indents = 1
        self.write('styles = [')

    def exitUNNAMED(self):
        self.write(']')

    # ------------------------------------------------------------
    # named_block
    #     : ('@name' STRING_LITERAL SEMI elements)
    #     ;
    # ------------------------------------------------------------

    def enterNamed_block(self, name):
        """Start a `name : [...]` entry in the styles dict."""
        self.write(self.blockCommaStack[-1])
        self.write(self.indent() + name + ' : [')
        self.blockCommaStack[-1] = ',\n'  # subsequent blocks need a separator
        self.indents += 1

    def exitNamed_block(self):
        self.write(self.indent() + ']')

    # ------------------------------------------------------------
    # elements
    #     : element ( element )*
    #     ;
    # ------------------------------------------------------------

    def enterElements(self):
        self.elementCommaStack.append('\n')
        self.attribCommaStack.append("\n")

    def exitElements(self):
        self.write("\n")
        self.indents -= 1
        self.elementCommaStack.pop()

    # ------------------------------------------------------------
    # element
    #     : '@'? 'level' (STRUDEL def_name)? spec_conditions* (LPAREN condition RPAREN)? LCURLY attributes RCURLY
    #     | '@'? element_name (STRUDEL def_name)? (LPAREN condition RPAREN)? LCURLY attributes RCURLY
    #     ;
    # ------------------------------------------------------------

    def enterElement_name(self, name):
        """Open a constructor call for the CamelCased element *name*."""
        txt = self.toCamelCase(name)
        self.write(self.indent() + txt + '(')
        self.indents += 1

    def enterElement(self, txt):
        self.write(self.elementCommaStack[-1])
        self.attribCommaStack.append("\n")  # maybe we have a condition first
        if 'level' in txt[:15]:
            self.write(self.indent() + 'Level(')
            self.indents += 1

    def exitElement(self):
        self.indents -= 1
        self.write(self.indent() + ')')
        self.elementCommaStack[-1] = ",\n"

    def enterCondition(self):
        """Emit a `condition = lambda item : ...` attribute."""
        self.conditionContext = True
        self.write(self.attribCommaStack[-1])
        self.write(self.indent() + 'condition = lambda item : ')
        self.attribCommaStack[-1] = ',\n'

    def exitCondition(self):
        self.conditionContext = False
        self.attribCommaStack[-1] = ',\n'

    # ------------------------------------------------------------
    # attributes
    #     : attribute*
    #     ;
    # ------------------------------------------------------------

    def enterAttributes(self):
        # attribCommaStack is already pushed in enterElement/enterElements
        pass

    def exitAttributes(self):
        self.write("\n")
        self.attribCommaStack.pop()

    # ------------------------------------------------------------
    # attribute
    #     : 'symmetry' COLON sym_expression SEMI
    #     | 'use' COLON use_expression SEMI
    #     | ('faces' | 'sharpEdges') COLON smooth_expression SEMI
    #     | attr_name COLON expression SEMI
    #     | attr_name COLON markup_block  // markup
    #     ;
    # sym_expression
    #     : IDENTIFIER
    #     ;
    # use_expression
    #     : IDENTIFIER (COMMA IDENTIFIER)*
    #     ;
    # smooth_expression
    #     : expression
    #     ;
    # markup_block
    #     : LBRACK elements RBRACK
    #     ;
    # ------------------------------------------------------------

    def enterSym_expression(self, sym):
        """Emit `symmetry = symmetry.<CamelCase(sym)>`."""
        self.write(self.attribCommaStack[-1])
        symmetry = self.toCamelCase(sym)
        self.write(self.indent() + 'symmetry = symmetry.' + symmetry)
        self.attribCommaStack[-1] = ",\n"

    def enterUse_expression(self, enterSimple_expr):
        """Emit `use = (<quoted identifiers>,)`."""
        self.write(self.attribCommaStack[-1])
        expression = self.literalize(enterSimple_expr, False)
        self.write(self.indent() + 'use = (' + expression + ',)')
        self.attribCommaStack[-1] = ",\n"

    def enterSmooth_expression(self, name):
        self.smoothContext = True
        self.write(self.attribCommaStack[-1])
        self.write(self.indent() + name + ' = ')
        self.attribCommaStack[-1] = ",\n"

    def exitSmooth_expression(self):
        self.smoothContext = False

    def enterMarkup_block(self):
        self.write(' [')
        self.indents += 1

    def exitMarkup_block(self):
        self.write(self.indent() + ']')

    def enterAttr_name(self, name):
        """Emit `<name> = ` for a generic attribute."""
        if name == 'class':  # avoid conflict with Python keyword
            name = 'cl'
        self.write(self.attribCommaStack[-1])
        self.write(self.indent() + name + ' = ')
        self.attribCommaStack[-1] = ",\n"

    # ------------------------------------------------------------
    # expression
    #     :  simple_expr | function | alternatives
    #     ;
    # alternatives
    #     : function (PIPE function)+
    #     ;
    # ------------------------------------------------------------

    def enterAlternatives(self):
        """Open a `Value(Alternatives(...))` wrapper around piped functions."""
        self.alternativesContext = True
        # NOTE(review): functionContext is set here but never initialised in
        # __init__ nor read in this class — confirm it is used by a subclass.
        self.functionContext = True
        self.write("Value(Alternatives(\n")
        self.alterCommaStack.append("")
        self.indents += 1

    def exitAlternatives(self):
        self.alternativesContext = False
        self.alterCommaStack.pop()
        self.indents -= 1
        self.write('\n' + self.indent() + '))')

    # ------------------------------------------------------------
    # function
    #     : 'attr' LPAREN string_literal RPAREN           #ATTR
    #     | 'bldgAttr' LPAREN string_literal RPAREN       #BUILDATTR
    #     | 'random_normal' LPAREN NUMBER RPAREN          #RANDN
    #     | 'random_weighted' nested_list                 #RANDW
    #     | 'if' LPAREN conditional RPAREN function       #COND
    #     | 'use_from' LPAREN IDENTIFIER RPAREN           #USEFROM
    #     | 'per_building' LPAREN function RPAREN         #PERBUILD
    #     | 'rgb' LPAREN NUMBER COMMA NUMBER COMMA NUMBER RPAREN      #RGB
    #     | 'rgba' LPAREN NUMBER COMMA NUMBER COMMA NUMBER COMMA NUMBER RPAREN      #RGBA
    #     | constant                                      #CONST
    #     | nested_list                                   #NESTED
    #     | arith_atom                                    #ARITH
    #     ;
    # ------------------------------------------------------------

    def enterATTR(self, attribute):
        """Emit FromAttr(...) call(s); one per possible attribute type."""
        types = self.dictionary.getAttributeTypes(attribute)
        if self.alternativesContext:
            self.write(self.alterCommaStack[-1])
            if len(types) == 1:
                self.write(self.indent() + "FromAttr(" + attribute + ", " +
                           types[0] + ')')
            else:
                self.write(self.indent() + "FromAttr(" + attribute + ", " +
                           types[0] + '),\n')
                self.write(self.indent() + "FromAttr(" + attribute + ", " +
                           types[1] + ')')
            self.alterCommaStack[-1] = ",\n"
        else:
            if len(types) == 1:
                self.write("Value(FromAttr(" + attribute + ", " + types[0] +
                           '))')
            else:
                # two possible types -> wrap both in Alternatives
                self.write('Value(Alternatives(\n')
                self.indents += 1
                self.write(self.indent() + "FromAttr(" + attribute + ", " +
                           types[0] + '),\n')
                self.write(self.indent() + "FromAttr(" + attribute + ", " +
                           types[1] + ')\n')
                self.indents -= 1
                self.write(self.indent() + '))')

    def enterBUILDATTR(self, attribute):
        """Emit FromBldgAttr(...) call(s); mirrors enterATTR."""
        types = self.dictionary.getAttributeTypes(attribute)
        if self.alternativesContext:
            self.write(self.alterCommaStack[-1])
            if len(types) == 1:
                self.write(self.indent() + "FromBldgAttr(" + attribute + ", " +
                           types[0] + ')')
            else:
                self.write(self.indent() + "FromBldgAttr(" + attribute + ", " +
                           types[0] + '),\n')
                self.write(self.indent() + "FromBldgAttr(" + attribute + ", " +
                           types[1] + ')')
            self.alterCommaStack[-1] = ",\n"
        else:
            if len(types) == 1:
                self.write("Value(FromBldgAttr(" + attribute + ", " +
                           types[0] + '))')
            else:
                self.write('Value(Alternatives(\n')
                self.indents += 1
                self.write(self.indent() + "FromBldgAttr(" + attribute + ", " +
                           types[0] + '),\n')
                self.write(self.indent() + "FromBldgAttr(" + attribute + ", " +
                           types[1] + ')\n')
                self.indents -= 1
                self.write(self.indent() + '))')

    # ------------------------------------------------------------
    # function
    #     : ...
    #     | 'random_normal' LPAREN NUMBER RPAREN          #RANDN
    #     | 'random_weighted' nested_list                 #RANDW
    #     | ...
    # ------------------------------------------------------------

    def enterRANDN(self, value):
        """Emit RandomNormal(value), wrapped in Value(...) at top level."""
        if self.alternativesContext or self.conditionContext:
            self.write(self.alterCommaStack[-1])
            self.write(self.indent() + 'RandomNormal( ' + value + ' )')
            self.alterCommaStack[-1] = ",\n"
        else:
            self.write('Value(RandomNormal( ' + value + ' ))')

    def enterRANDW(self, li):
        """Emit RandomWeighted(<literalized list>)."""
        li = self.replaceColorsInText(li)
        weighted = self.literalize(li, False)  # renamed: don't shadow builtin 'list'
        if self.alternativesContext or self.conditionContext:
            self.write(self.alterCommaStack[-1])
            self.write(self.indent() + 'RandomWeighted( ' + weighted + ' )')
            self.alterCommaStack[-1] = ",\n"
        else:
            self.write('Value(RandomWeighted( ' + weighted + ' ))')

    # ------------------------------------------------------------
    # function
    #     : ...
    #     | 'if' LPAREN conditional RPAREN function       #COND
    #     | ...
    # ------------------------------------------------------------

    def enterCOND(self, condition, result):
        """Open a Conditional(lambda item: ...) expression."""
        self.conditionContext = True
        self.conditionalContext = True
        if self.alternativesContext:
            self.write(self.alterCommaStack[-1])
            self.write(self.indent() + "Conditional(\n")
            self.indents += 1
            self.write(self.indent() + 'lambda item: ')
            self.alterCommaStack.append(',\n')
        else:
            self.write("Value(Conditional(\n")
            self.indents += 1
            self.write(self.indent() + 'lambda item: ')

    def exitCOND(self):
        self.indents -= 1
        self.write("\n")
        if self.conditionContext:
            if self.alternativesContext:
                self.write(self.indent() + ")")
            else:
                self.write(self.indent() + "))")
        else:
            self.write(self.indent() + ")")
        self.conditionContext = False
        self.conditionalContext = False

    # ------------------------------------------------------------
    # function
    #     : ...
    #     | 'use_from' LPAREN IDENTIFIER RPAREN           #USEFROM
    #     | 'per_building' LPAREN function RPAREN         #PERBUILD
    #     | ...
    # ------------------------------------------------------------

    def enterUSEFROM(self, ident):
        self.write('useFrom("' + ident + '")')

    def enterPERBUILD(self):
        """Open a PerBuilding(...) wrapper."""
        if self.alternativesContext:
            self.write(self.alterCommaStack[-1])
            self.alterCommaStack[-1] = ''  # children are the first items inside
            self.write(self.indent() + 'PerBuilding(\n')
            self.indents += 1
        else:
            self.write('PerBuilding(')

    def exitPERBUILD(self):
        if self.alternativesContext:
            self.indents -= 1
            self.alterCommaStack[-1] = ',\n'
            self.write('\n' + self.indent() + ')')
        else:
            self.write(')')

    # ------------------------------------------------------------
    # function
    #     : ...
    #     | 'rgb' LPAREN NUMBER COMMA NUMBER COMMA NUMBER RPAREN      #RGB
    #     | 'rgba' LPAREN NUMBER COMMA NUMBER COMMA NUMBER COMMA NUMBER RPAREN      #RGBA
    #     | ...
    # ------------------------------------------------------------

    def enterRGB(self, rgb):
        """Emit a Constant(colour tuple) for an rgb(...) literal."""
        expr = self.replaceColorsInText(rgb)
        if self.alternativesContext or self.conditionalContext:
            self.write(self.alterCommaStack[-1])
            self.write(self.indent() + "Constant(" + expr + ')')
            self.alterCommaStack[-1] = ',\n'
        else:
            # BUGFIX: the original emitted 'Value(Constant(...)' with
            # unbalanced parentheses; every sibling branch closes with '))'
            self.write('Value(Constant(' + expr + '))')

    def enterRGBA(self, rgba):
        """Emit a Constant(colour tuple) for an rgba(...) literal."""
        expr = self.replaceColorsInText(rgba)
        if self.alternativesContext or self.conditionalContext:
            self.write(self.alterCommaStack[-1])
            self.write(self.indent() + "Constant(" + expr + ')')
            self.alterCommaStack[-1] = ',\n'
        else:
            # BUGFIX: balance the parentheses (see enterRGB)
            self.write('Value(Constant(' + expr + '))')

    # ------------------------------------------------------------
    # function
    #     : ...
    #     | constant                                      #CONST
    #     | nested_list                                   #NESTED
    #     | ...
    # ------------------------------------------------------------

    def enterCONST(self, text):
        """Emit a constant: smoothness enum, quoted literal or bare expression."""
        text = self.replaceColorsInText(text)

        if self.conditionalContext:
            self.write(',\n')
            if self.smoothContext and text in ('smooth', 'flat', 'horizontal',
                                               'side', 'all'):
                expr = self.toCamelCase(text)
                self.write(self.indent() + "Constant(smoothness." + expr + ')')
            else:
                expr = self.literalize(text, False)
                self.write(self.indent() + "Constant(" + expr + ')')
        elif self.alternativesContext:
            self.write(self.alterCommaStack[-1])
            expr = self.literalize(text, False)
            self.write(self.indent() + "Constant(" + expr + ')')
            self.alterCommaStack[-1] = ',\n'
        else:
            expr = self.literalize(text, False)
            self.write(expr)

    def enterNESTED(self, li):
        """Emit a (literalized) nested list."""
        li = self.replaceColorsInText(li)
        nested = self.literalize(li, False)  # renamed: don't shadow builtin 'list'
        if self.alternativesContext:
            self.write(self.alterCommaStack[-1])
            self.write(self.indent() + nested)
            self.alterCommaStack[-1] = ',\n'
        else:
            self.write(nested)

    def exitINNESTED(self, li):
        literal = self.literalize(li, True)  # renamed: don't shadow builtin 'list'
        self.write(literal)

    # ------------------------------------------------------------
    # spec_conditions
    #     : LBRACK spec_condition RBRACK  #SPEC_LEVEL
    #     ;
    # spec_condition
    #     : '@roof'                   #SPEC_ROOF
    #     | '@all'                    #SPEC_ALL
    #     | NUMBER COLON NUMBER       #SPEC_FULL_INDX
    #     | NUMBER COLON              #SPEC_LEFT_INDX
    #     | COLON NUMBER              #SPEC_RIGHT_INDX
    #     ;
    # ------------------------------------------------------------

    def enterSPEC_LEVEL(self):
        self.spec_condition = []

    def exitSPEC_LEVEL(self):
        """Emit roofLevels/allLevels flags collected in spec_condition."""
        roof_val = 'True' if '@roof' in self.spec_condition else 'False'
        self.write(self.attribCommaStack[-1])
        self.write(self.indent() + 'roofLevels = ' + roof_val)
        self.attribCommaStack[-1] = ",\n"
        all_val = 'True' if '@all' in self.spec_condition else 'False'
        self.write(self.attribCommaStack[-1])
        self.write(self.indent() + 'allLevels  = ' + all_val)
        self.attribCommaStack[-1] = ",\n"
        self.spec_condition = []

    def enterSPEC_ROOF(self, cond):
        self.spec_condition.append(cond)

    def enterSPEC_FULL_INDX(self, index_text):
        """'N:M' -> indices = (N,M)."""
        indices = index_text.split(':')
        self.write(self.attribCommaStack[-1])
        self.write(self.indent() + 'indices = (' + indices[0] + ',' +
                   indices[1] + ')')
        self.attribCommaStack[-1] = ",\n"

    def enterSPEC_SINGLE(self, index_text):
        """'N' -> indices = (N,N)."""
        self.write(self.attribCommaStack[-1])
        self.write(self.indent() + 'indices = (' + index_text + ',' +
                   index_text + ')')
        self.attribCommaStack[-1] = ",\n"

    def enterSPEC_LEFT_INDX(self, index_text):
        """'N:' -> indices = (N,-1)."""
        indices = index_text.split(':')
        self.write(self.attribCommaStack[-1])
        self.write(self.indent() + 'indices = (' + indices[0] + ',-1)')
        self.attribCommaStack[-1] = ",\n"

    def enterSPEC_RIGHT_INDX(self, index_text):
        """':M' -> indices = (0,M)."""
        indices = index_text.split(':')
        self.write(self.attribCommaStack[-1])
        self.write(self.indent() + 'indices = (0,' + indices[1] + ')')
        self.attribCommaStack[-1] = ",\n"

    # ------------------------------------------------------------
    # arith_atom
    #     : 'item' '.' IDENTIFIER                                 # ATOM_SINGLE
    #     | 'item' '.' IDENTIFIER '.' IDENTIFIER                  # ATOM_SINGLE
    #     | 'item' '.' IDENTIFIER LBRACK STRING_LITERAL RBRACK    # ATOM_FROMATTR
    #     | 'item' LBRACK STRING_LITERAL RBRACK                   # ATOM_FROMATTR_SHORT
    #     | 'style' '.' IDENTIFIER                                # ATOM_STYLE
    #     | identifier                                            # ATOM_IDENT
    #     | NUMBER                                                # ATOM_IDENT
    #     | STRING_LITERAL                                        # ATOM_IDENT
    # ------------------------------------------------------------

    def enterATOM_SINGLE(self, atom):
        self.write(atom)

    def enterATOM_FROMATTR(self, ident, literal):
        if self.conditionContext or self.conditionalContext:
            self.write('item.' + ident + '.getStyleBlockAttr(' + literal + ')')
        else:
            identifier = ident.capitalize()
            self.write("FromStyleBlockAttr(" + literal +
                       ",FromStyleBlockAttr." + identifier + ")")

    def enterATOM_FROMATTR_SHORT(self, literal):
        if self.conditionContext:
            self.write('item.getStyleBlockAttr(' + literal + ')')
        else:
            self.write(self.alterCommaStack[-1])
            self.write(self.indent() + "FromStyleBlockAttr(" + literal + ")")
            self.alterCommaStack[-1] = ",\n"

    def enterATOM_STYLE(self, identifier):
        if self.conditionContext:
            self.write('self.' + identifier)
        else:
            self.write(self.alterCommaStack[-1])
            self.write(self.indent() + 'self.' + identifier)
            self.alterCommaStack[-1] = ",\n"

    def enterATOM_IDENT(self, ident):
        self.write(ident)

    def enterConst_atom(self, atom):
        const = self.literalize(atom, True)
        self.write(',\n' + self.indent() + "Constant(" + const + ')')

    # ------------------------------------------------------------
    #   ... and all the remaining details
    # ------------------------------------------------------------
    def enterDef_name(self, definition):
        """Emit `defName = "<definition>"`."""
        self.write(self.attribCommaStack[-1])
        self.write(self.indent() + 'defName = "' + definition + '"')
        self.attribCommaStack[-1] = ",\n"

    def enterConstant(self, text):
        self.enterCONST(text)

    def enterSimple_expr(self, text):
        """Emit a simple expression; booleans become Python True/False."""
        if self.smoothContext:
            return
        if text in ('true', 'false'):
            expr = text.capitalize()
        else:
            expr = self.replaceColorsInText(text)
            expr = self.literalize(expr, False)
        if self.alternativesContext or self.conditionalContext:
            self.write(self.indent() + "Constant(" + expr + ')')
        else:
            self.write(expr)

    def enterAri_lparen(self):
        self.write(' (')

    def enterAri_rparen(self):
        self.write(') ')

    def enterIdentifier(self, ident):
        # only smoothness identifiers are emitted here; everything else is
        # handled by the more specific callbacks
        if self.smoothContext and ident in ('smooth', 'flat', 'horizontal',
                                            'side', 'all'):
            self.write('smoothness.' + ident)

    def enterInop(self, op):
        self.write(' ' + op + ' ')

    def enterRelop(self, op):
        self.write(' ' + op + ' ')

    def enterLogicop(self, op):
        self.write(' ' + op + ' ')

    def enterNotop(self, op):
        self.write(' ' + op + ' ')

    def enterArith_op(self, op):
        self.write(' ' + op + ' ')
Example no. 5
# Script fragment: build the HBC vocabulary and dictionary lookup tables.
vocab = Vocab('%s/hbc/data/vocab' % (BASE_DIR))

# 1-based index -> dictionary name
dictMap = {}
i = 1
for line in open('%s/hbc/data/dictionaries' % (BASE_DIR)):
    dictionary = line.rstrip('\n')
    dictMap[i] = dictionary
    i += 1

# inverse mapping: dictionary name -> index
dict2index = {}
for i in dictMap.keys():
    dict2index[dictMap[i]] = i

# NOTE(review): 'llda' and 'BASE_DIR' are defined outside this fragment —
# confirm they are set before this point.
if llda:
    dictionaries = Dictionaries('%s/data/LabeledLDA_dictionaries3' % (BASE_DIR), dict2index)
# entity string -> 0-based id (only populated when llda is truthy)
entityMap = {}
i = 0
if llda:
    for line in open('%s/hbc/data/entities' % (BASE_DIR)):
        entity = line.rstrip('\n')
        entityMap[entity] = i
        i += 1

# dictionary name -> label
dict2label = {}
for line in open('%s/hbc/data/dict-label3' % (BASE_DIR)):
    (dictionary, label) = line.rstrip('\n').split(' ')
    dict2label[dictionary] = label


#ACCESS_TOKEN = '<redacted>'  # SECURITY: a real OAuth token was committed here — revoke it and load credentials from the environment instead
        for line in open(eFile):
            entity = line.rstrip('\n')
            self.entityNumbers[entity] = i
            i += 1

    def GetID(self, entity):
        """Return the numeric id for *entity*, or -1 if it is unknown."""
        # dict.get replaces the Python-2-only has_key() (removed in Python 3)
        # and does a single lookup instead of two
        return self.entityNumbers.get(entity, -1)


# Script fragment: prepare HBC training inputs from a tagged-sentence file.
# argv: [1] input file, [2] vocab path, [3] entity map path, [4] dictionaries path
ts = TaggedSentence()
vocab = Vocab(sys.argv[2])
entityMap = EntityMap(sys.argv[3])
dictionaries = Dictionaries(sys.argv[4])

# output files consumed by the downstream HBC pipeline
hbcOut = open('test.hbc', 'w')
entitiesOut = open('entities', 'w')
entityMapOut = open('entityMap', 'w')
goldOut = open('gold', 'w')
labelsOut = open('labels', 'w')
dictOut = open('dictionaries', 'w')
nInVocabOut = open('nInVocab', 'w')

# dump the dictionary names, one per line
for dictionary in dictionaries.dictionaries:
    dictOut.write('%s\n' % dictionary)

# NOTE(review): this loop appears truncated by the paste — the body likely
# continues beyond this fragment.
for line in open(sys.argv[1]):
    line = line.strip()
    fields = re.split(r'\s+', line)
Example no. 7
#!/usr/bin/python
# Script: read tab-separated labelled tweets from stdin and dump
# entities/labels/dictionaries files for LabeledLDA.

import sys
sys.path.append('/homes/gws/aritter/twitter_nlp/python')
from twokenize import tokenize

from LdaFeatures import LdaFeatures
from Vocab import Vocab
from Dictionaries import Dictionaries

vocab = Vocab()
eOut = open('entities', 'w')
lOut = open('labels', 'w')
dOut = open('dictionaries', 'w')

dictionaries = Dictionaries(
    '/homes/gws/aritter/twitter_nlp/data/LabeledLDA_dictionaries2')

prevText = None
for line in sys.stdin:
    line = line.rstrip('\n')
    fields = line.split('\t')

    # positional fields from the front, tagger outputs from the back
    sid = fields[0]
    text = fields[6]
    words = tokenize(text)
    confidence = 1.0 / float(fields[-1])
    eType = fields[-2]
    entity = fields[-3]
    neTags = fields[-4].split(' ')
    pos = fields[-5].split(' ')
    # NOTE(review): this overwrites the tokenize() result above — confirm
    # which tokenisation is actually intended.
    words = fields[-6].split(' ')
class EntityExtractor:
    """Named-entity tagger for tweets.

    Pipes tokenized tweets through a mallet CRF NER model (optionally
    preceded by POS / chunk / event taggers) and, when classification is
    enabled, types each extracted entity with a LabeledLDA sampler.
    """

    def __init__(self, pos, chunk, event, classify, mallet_memory='256m'):
        """Start the external tagger subprocesses and load lexical resources.

        pos, chunk, event, classify -- booleans enabling the optional stages.
        mallet_memory -- JVM heap size passed to the mallet NER subprocess.
        """
        self.clear_line_counter()

        # Chunk and event taggers depend on POS tags, so they are only
        # started when the POS tagger is too.
        self.posTagger = pos_tagger_stdin.PosTagger() if pos else None
        self.chunkTagger = chunk_tagger_stdin.ChunkTagger() if chunk and pos else None
        self.eventTagger = event_tagger_stdin.EventTagger() if event and pos else None
        self.llda = GetLLda() if classify else None

        # Pick the NER model trained on whichever features we can produce.
        if pos and chunk:
            self.ner_model = 'ner.model'
        elif pos:
            self.ner_model = 'ner_nochunk.model'
        else:
            self.ner_model = 'ner_nopos_nochunk.model'

        self.ner = GetNer(self.ner_model, memory=mallet_memory)
        self.fe = Features.FeatureExtractor('%s/data/dictionaries' % (BASE_DIR))
        self.capClassifier = cap_classifier.CapClassifier()
        self.vocab = Vocab('%s/hbc/data/vocab' % (BASE_DIR))

        # dictMap: 1-based dictionary index -> dictionary name (the LDA
        # sampler reports dictionaries by numeric index).
        # (The original redundant `self.dictMap = self.dictMap` self-
        # assignment has been removed.)
        self.dictMap = {}
        i = 1
        for line in open('%s/hbc/data/dictionaries' % (BASE_DIR)):
            dictionary = line.rstrip('\n')
            self.dictMap[i] = dictionary
            i += 1

        # Inverse mapping: dictionary name -> index.
        self.dict2index = {}
        for i in self.dictMap.keys():
            self.dict2index[self.dictMap[i]] = i

        # Entity-typing resources are only needed when classifying.
        # `dictionaries` is set to None otherwise so close() can always
        # `del` it (the original left the attribute unset, making close()
        # raise AttributeError when classify was False).
        if self.llda:
            self.dictionaries = Dictionaries('%s/data/LabeledLDA_dictionaries3' % (BASE_DIR), self.dict2index)
        else:
            self.dictionaries = None
        self.entityMap = {}
        i = 0
        if self.llda:
            for line in open('%s/hbc/data/entities' % (BASE_DIR)):
                entity = line.rstrip('\n')
                self.entityMap[entity] = i
                i += 1

        # dictionary name -> output entity label.
        self.dict2label = {}
        for line in open('%s/hbc/data/dict-label3' % (BASE_DIR)):
            (dictionary, label) = line.rstrip('\n').split(' ')
            self.dict2label[dictionary] = label

    def clear_line_counter(self):
        """Reset the processed-line counter."""
        self.nlines = 0

    def trigger_line_counter(self):
        """Restart the NER subprocess (works around a mallet memory leak)."""
        self.ner.stdin.close()
        self.ner.stdout.close()
        os.kill(self.ner.pid, SIGTERM)  # Need to do this for python 2.4
        self.ner.wait()
        self.ner = GetNer(self.ner_model)

    def line_counter(self):
        """Count one processed line; recycle the NER process every 50k lines."""
        self.nlines += 1
        if self.nlines % 50000 == 0:
            self.trigger_line_counter()
            self.clear_line_counter()

    def parse_lines(self, lines):
        """Tag each line and return a list of 'word/tag[/pos][/chunk][/event]'
        strings, one entry per input line."""
        res = []
        for line in lines:
            line = line.encode('utf-8', "ignore")
            words = twokenize.tokenize(line)
            seq_features = []
            tags = []

            # Capitalization is only a reliable feature when the tweet is
            # judged well-capitalized.
            goodCap = self.capClassifier.Classify(words) > 0.9

            if self.posTagger:
                pos = self.posTagger.TagSentence(words)
                pos = [re.sub(r':[^:]*$', '', p) for p in pos]  # remove weights
            else:
                pos = None

            # Chunking the tweet
            if self.posTagger and self.chunkTagger:
                word_pos = zip(words, [p.split(':')[0] for p in pos])
                chunk = self.chunkTagger.TagSentence(word_pos)
                chunk = [c.split(':')[0] for c in chunk]  # remove weights
            else:
                chunk = None

            # Event tags
            if self.posTagger and self.eventTagger:
                events = self.eventTagger.TagSentence(words, [p.split(':')[0] for p in pos])
                events = [e.split(':')[0] for e in events]
            else:
                events = None

            # Per-token feature strings, sent to the NER process as one
            # tab-separated line; it answers with one tag per token.
            quotes = Features.GetQuotes(words)
            for i in range(len(words)):
                features = self.fe.Extract(words, pos, chunk, i, goodCap) + ['DOMAIN=Twitter']
                if quotes[i]:
                    features.append("QUOTED")
                seq_features.append(" ".join(features))
            self.ner.stdin.write(("\t".join(seq_features) + "\n").encode('utf8'))

            for i in range(len(words)):
                tags.append(self.ner.stdout.readline().rstrip('\n').strip(' '))
            features = LdaFeatures(words, tags)

            # Extract and classify entities
            for i in range(len(features.entities)):
                wids = [str(self.vocab.GetID(x.lower())) for x in features.features[i] if self.vocab.HasWord(x.lower())]
                if self.llda and len(wids) > 0:
                    entityid = "-1"
                    # `in` replaces the Python-2-only has_key().
                    if features.entityStrings[i].lower() in self.entityMap:
                        entityid = str(self.entityMap[features.entityStrings[i].lower()])
                    labels = self.dictionaries.GetDictVector(features.entityStrings[i])

                    # No dictionary matched: let the sampler consider all labels.
                    if sum(labels) == 0:
                        labels = [1 for _ in labels]
                    self.llda.stdin.write("\t".join([entityid, " ".join(wids), " ".join([str(x) for x in labels])]) + "\n")
                    sample = self.llda.stdout.readline().rstrip('\n')
                    # Strip the sampler's fixed-width framing, then map each
                    # sampled dictionary index back to its label.
                    labels = [self.dict2label[self.dictMap[int(x)]] for x in sample[4:len(sample) - 8].split(' ')]

                    # Majority vote over the sampled labels.
                    count = {}
                    for label in labels:
                        count[label] = count.get(label, 0.0) + 1.0
                    maxL = None
                    maxP = 0.0
                    for label in count.keys():
                        p = count[label] / float(len(count))
                        if p > maxP or maxL is None:
                            maxL = label
                            maxP = p

                    if maxL != 'None':
                        tags[features.entities[i][0]] = "B-%s" % (maxL)
                        for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                            tags[j] = "I-%s" % (maxL)
                    else:
                        tags[features.entities[i][0]] = "O"
                        for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                            tags[j] = "O"
                else:
                    # Without classification, keep the generic ENTITY type.
                    tags[features.entities[i][0]] = "B-ENTITY"
                    for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                        tags[j] = "I-ENTITY"

            output = ["%s/%s" % (words[x], tags[x]) for x in range(len(words))]
            if pos:
                output = ["%s/%s" % (output[x], pos[x]) for x in range(len(output))]
            if chunk:
                output = ["%s/%s" % (output[x], chunk[x]) for x in range(len(output))]
            if events:
                output = ["%s/%s" % (output[x], events[x]) for x in range(len(output))]
            res.append(" ".join(output))

            # seems like there is a memory leak comming from mallet, so just
            # restart it periodically (see line_counter).
            self.line_counter()

        return res

    def close(self):
        """Terminate the external subprocesses and release loaded resources."""
        self.ner.stdin.close()
        self.ner.stdout.close()
        self.ner.terminate()
        self.ner.wait()
        # llda is only started when classify=True; the original called its
        # methods unconditionally and crashed with AttributeError otherwise.
        if self.llda is not None:
            self.llda.stdin.close()
            self.llda.stdout.close()
            self.llda.terminate()
            self.llda.wait()
        del self.ner, self.llda
        del self.dict2index, self.dict2label, self.dictionaries, self.dictMap, self.entityMap
        del self.chunkTagger, self.eventTagger, self.posTagger
        del self.capClassifier, self.fe, self.vocab
# Vocabulary mapping words to integer ids for the LDA sampler.
vocab = Vocab('%s/hbc/data/vocab' % (BASE_DIR))

# dictMap: 1-based dictionary index -> dictionary name.
dictMap = {}
i = 1
for line in open('%s/hbc/data/dictionaries' % (BASE_DIR)):
    dictionary = line.rstrip('\n')
    dictMap[i] = dictionary
    i += 1

# Inverse mapping: dictionary name -> index.
dict2index = {}
for i in dictMap.keys():
    dict2index[dictMap[i]] = i

# NOTE(review): `llda` is defined earlier in this script (not visible here);
# the entity-typing resources below are only loaded when it is enabled.
if llda:
    dictionaries = Dictionaries(
        '%s/data/LabeledLDA_dictionaries3' % (BASE_DIR), dict2index)
entityMap = {}
i = 0
if llda:
    for line in open('%s/hbc/data/entities' % (BASE_DIR)):
        entity = line.rstrip('\n')
        entityMap[entity] = i
        i += 1

# dictionary name -> output entity label.
dict2label = {}
for line in open('%s/hbc/data/dict-label3' % (BASE_DIR)):
    (dictionary, label) = line.rstrip('\n').split(' ')
    dict2label[dictionary] = label

# Python 2 print-to-stderr syntax; this script is Python 2 throughout.
print >> sys.stderr, "Finished loading all models. Now waiting for files in %s and writing to %s" % (
    INPUT_DIR, OUTPUT_DIR)
Esempio n. 10
0
class RitterTagger:
    """Single-tweet NER tagger: CRF entity segmentation followed by
    LabeledLDA entity typing (POS/chunk/event stages are disabled here).
    """

    def __init__(self):
        self.numberLines = 0
        # Optional taggers are not used by this tagger.
        self.eventTagger = None
        self.posTagger = None
        self.chunkTagger = None
        self.llda = GetLLda()
        self.ner = GetNer('ner_nopos_nochunk.model')
        self.fe = Features.FeatureExtractor('%s/data/dictionaries' %
                                            (BASE_DIR))
        self.capClassifier = cap_classifier.CapClassifier()

        self.vocab = Vocab('%s/hbc/data/vocab' % (BASE_DIR))

        # dictMap: 1-based dictionary index -> dictionary name (the LDA
        # sampler reports dictionaries by numeric index).
        self.dictMap = {}
        i = 1
        for line in open('%s/hbc/data/dictionaries' % (BASE_DIR)):
            dictionary = line.rstrip('\n')
            self.dictMap[i] = dictionary
            i += 1

        # Inverse mapping: dictionary name -> index.
        dict2index = {}
        for i in self.dictMap.keys():
            dict2index[self.dictMap[i]] = i

        if self.llda:
            self.dictionaries = Dictionaries(
                '%s/data/LabeledLDA_dictionaries3' % (BASE_DIR), dict2index)
        self.entityMap = {}
        i = 0
        if self.llda:
            for line in open('%s/hbc/data/entities' % (BASE_DIR)):
                entity = line.rstrip('\n')
                self.entityMap[entity] = i
                i += 1

        # dictionary name -> output entity label.
        self.dict2label = {}
        for line in open('%s/hbc/data/dict-label3' % (BASE_DIR)):
            (dictionary, label) = line.rstrip('\n').split(' ')
            self.dict2label[dictionary] = label

    def process_tweet(self, line):
        """Tag one tweet and return a 'word/tag word/tag ...' string."""
        self.numberLines = self.numberLines + 1

        # Restart the mallet NER process every 1000 tweets to work around
        # its memory leak.
        if self.numberLines == 1000:
            self.numberLines = 0
            self.ner.stdin.close()
            self.ner.stdout.close()
            os.kill(self.ner.pid, SIGTERM)  #Need to do this for python 2.4
            self.ner.wait()
            self.ner = GetNer('ner_nopos_nochunk.model')

        words = twokenize.tokenize(line)
        seq_features = []
        tags = []

        # Capitalization is only a reliable feature when the tweet is
        # judged well-capitalized.
        goodCap = self.capClassifier.Classify(words) > 0.9

        if self.posTagger:
            pos = self.posTagger.TagSentence(words)
            pos = [re.sub(r':[^:]*$', '', p) for p in pos]  # remove weights
        else:
            pos = None

    # Chunking the tweet
        if self.posTagger and self.chunkTagger:
            word_pos = zip(words, [p.split(':')[0] for p in pos])
            chunk = self.chunkTagger.TagSentence(word_pos)
            chunk = [c.split(':')[0] for c in chunk]  # remove weights
        else:
            chunk = None

    #Event tags
        if self.posTagger and self.eventTagger:
            events = self.eventTagger.TagSentence(
                words, [p.split(':')[0] for p in pos])
            events = [e.split(':')[0] for e in events]
        else:
            events = None

        # Per-token feature strings, sent to the NER process as one
        # tab-separated line; it answers with one tag per token.
        quotes = Features.GetQuotes(words)
        for i in range(len(words)):
            features = self.fe.Extract(words, pos, chunk, i,
                                       goodCap) + ['DOMAIN=Twitter']
            if quotes[i]:
                features.append("QUOTED")
            seq_features.append(" ".join(features))
        self.ner.stdin.write(("\t".join(seq_features) + "\n").encode('utf8'))

        for i in range(len(words)):
            tags.append(self.ner.stdout.readline().rstrip('\n').strip(' '))

        features = LdaFeatures(words, tags)

        #Extract and classify entities
        # (removed the unused `type = None` that shadowed the builtin)
        for i in range(len(features.entities)):
            wids = [
                str(self.vocab.GetID(x.lower())) for x in features.features[i]
                if self.vocab.HasWord(x.lower())
            ]
            if self.llda and len(wids) > 0:
                entityid = "-1"
                # `in` replaces the Python-2-only has_key().
                if features.entityStrings[i].lower() in self.entityMap:
                    entityid = str(
                        self.entityMap[features.entityStrings[i].lower()])
                labels = self.dictionaries.GetDictVector(
                    features.entityStrings[i])

                # No dictionary matched: let the sampler consider all labels.
                if sum(labels) == 0:
                    labels = [1 for x in labels]
                self.llda.stdin.write("\t".join([
                    entityid, " ".join(wids), " ".join(
                        [str(x) for x in labels])
                ]) + "\n")
                sample = self.llda.stdout.readline().rstrip('\n')
                # Strip the sampler's fixed-width framing, then map each
                # sampled dictionary index back to its label.
                labels = [
                    self.dict2label[self.dictMap[int(x)]]
                    for x in sample[4:len(sample) - 8].split(' ')
                ]

                # Majority vote over the sampled labels.
                count = {}
                for label in labels:
                    count[label] = count.get(label, 0.0) + 1.0
                maxL = None
                maxP = 0.0
                for label in count.keys():
                    p = count[label] / float(len(count))
                    if p > maxP or maxL is None:
                        maxL = label
                        maxP = p

                if maxL != 'None':
                    tags[features.entities[i][0]] = "B-%s" % (maxL)
                    for j in range(features.entities[i][0] + 1,
                                   features.entities[i][1]):
                        tags[j] = "I-%s" % (maxL)
                else:
                    tags[features.entities[i][0]] = "O"
                    for j in range(features.entities[i][0] + 1,
                                   features.entities[i][1]):
                        tags[j] = "O"
            else:
                tags[features.entities[i][0]] = "B-ENTITY"
                for j in range(features.entities[i][0] + 1,
                               features.entities[i][1]):
                    tags[j] = "I-ENTITY"

        # BUGFIX: the original joined the tokens into a single string first
        # and then indexed that string in the pos/chunk/events branches,
        # pairing tags with individual *characters*.  Keep a list of
        # word/tag tokens, extend each token, and join exactly once
        # (matching EntityExtractor.parse_lines).
        output = ["%s/%s" % (words[x], tags[x]) for x in range(len(words))]
        if pos:
            output = ["%s/%s" % (output[x], pos[x]) for x in range(len(output))]
        if chunk:
            output = ["%s/%s" % (output[x], chunk[x]) for x in range(len(output))]
        if events:
            output = ["%s/%s" % (output[x], events[x]) for x in range(len(output))]

        return " ".join(output)
Esempio n. 11
0
from optparse import OptionParser
import random

from LdaFeatures import LdaFeatures
from Vocab import Vocab
from Dictionaries import Dictionaries

# Command-line options for building LabeledLDA training data.
parser = OptionParser()
parser.add_option("--noWords", action="store_false", dest="useWords", default=True)
parser.add_option("--useNoDict", action="store_true", dest="useNoDict", default=False)
# NOTE(review): no type= is given, so --maxWords/--maxEntities arrive as
# strings when passed on the command line (the defaults stay int/None);
# confirm downstream code converts them before comparing.
parser.add_option("--maxWords", dest="maxWords", default=500)
parser.add_option("--maxEntities", dest="maxEntities", default=None)
parser.add_option("--dictDir", dest="dictDir", default='/homes/gws/aritter/twitter_nlp/data/LabeledLDA_dictionaries')
(options, args) = parser.parse_args()

dictionaries = Dictionaries(options.dictDir)
vocab = Vocab()
# Output files; the handles deliberately stay open for the processing
# loop that follows (they are not closed here).
eOut = open('entities', 'w')
lOut = open('labels', 'w')
dOut = open('dictionaries', 'w')

# English stopword list used to filter words.
stop_list = set()
for word in open('/homes/gws/aritter/twitter_nlp/data/dictionaries/english.stop'):
    word = word.rstrip('\n')
    stop_list.add(word)

# Placeholders for the optional no-dictionary output files (opened below
# depending on --useNoDict).
fNoDictOut = None
fNoDictOutLabels = None
fNoDictOutEntities = None
if not options.useNoDict:
    fNoDictOut = open('noDictOut', 'w')