Code example #1
File: Base.py Project: derAndreas/pyGtk3Docs
 def parse_doc(self):
     """ Parse a <doc> node in the ast.* instance if there is any """
     doc = self.node.find(self.ns(self.NS_CORE, 'doc'))
     
     if doc is not None:
         # need to import Doc here, because of nested usage of Base class,
         # which causes infinite import loop
         from Doc import Doc
         self.doc = Doc(self._namespace, doc)
Code example #2
def load_docs(docs_filename, numOfDoc, vocab, model):
    cnt = 0
    docs = [None] * numOfDoc
    len_sum = 0
    for line in open(docs_filename):
        doc = Doc(line, vocab)
        doc.init_varational_parameters(vocab, model)
        len_sum += len(doc)
        docs[cnt] = doc
        if cnt % 1000 == 0:
            print "progress:", cnt, "memory usage:", resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000, "time:", datetime.now()
        cnt += 1
        if cnt >= numOfDoc:
            break
    print "ave length of doc:", float(len_sum) / cnt
    return docs
Code example #3
def load_docs(docs_filename, numOfDoc, vocab, model):
    cnt = 0
    docs = [None] * numOfDoc
    len_sum = 0
    for line in open(docs_filename):
        doc = Doc(line, vocab)
        doc.init_varational_parameters(vocab, model)
        len_sum += len(doc)
        docs[cnt] = doc
        if cnt % 1000 == 0:
            print "progress:", cnt, "memoery useage:", resource.getrusage(
                resource.RUSAGE_SELF).ru_maxrss / 1000, "time:", datetime.now(
                )
        cnt += 1
        if cnt >= numOfDoc:
            break
    print "ave length of doc:", float(len_sum) / cnt
    return docs
Code example #4
    def __init__(self, docs, gram):
        self.index = {}
        self.docs = docs
        self.gram = gram
        if self.gram == 2:
            all_words = unique([inner
                                for outer in docs
                                for inner in docs[outer].words])
            new_docs = {}
            for i in range(len(all_words)):
                word = "#"+all_words[i]+"#"
                id = i
                words = [word[j:j+2] for j in range(len(word) - 1)]
                doc = Doc(id, ' '.join(words))
                doc.words = words
                new_docs[id] = doc
            self.docs = new_docs

        self.create_index()
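
For reference, the bigram branch above pads each vocabulary word with "#" and splits it into overlapping character pairs; a minimal standalone sketch of that one step (the word "cat" is only an illustration, not project data):

word = "#" + "cat" + "#"                  # padded form: "#cat#"
bigrams = [word[j:j + 2] for j in range(len(word) - 1)]
print(bigrams)                            # ['#c', 'ca', 'at', 't#']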
Code example #5
def extract_csv(filename):
    docs = {}
    df = pd.read_csv(filename)
    for i in range(df.shape[0]):
        text = df['Title'].values[i] + " " + df['Text'].values[i]
        if "Tag" in df.columns:
            tag = df['Tag'].values[i]
        else:
            tag = None
        docs[i] = Doc(i, text, tag)
    return docs
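
A hedged usage sketch for the function above, assuming a CSV with Title, Text and optionally Tag columns; the path is borrowed from code example #15 and may differ in practice:

docs = extract_csv('../data/English.csv')   # path reused from the query.py example
print(len(docs), docs[0].text)              # Doc objects expose .text in the related examples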
Code example #6
File: corpus.py Project: bamine/LDA
 def __init__(self,filename):
     self.docs = []
     self.vocab_size = 0
     self.n_docs = 0
     print "reading data from : "+filename
     f=open(filename)
     for line in f.readlines():
         parts=line.split()
         doc=Doc()
         for i,part in enumerate(parts):
             if i==0:
                 doc.length=int(part)
             else:
                 word,count=map(int,part.split(":"))
                 doc.words.append(word)
                 doc.word_counts.append(count)
                 doc.total+=count
                 if word>=self.vocab_size:
                     self.vocab_size=word+1
         # keep the parsed document; without this, self.n_docs stays 0
         self.docs.append(doc)
     f.close()
     self.n_docs=len(self.docs)
Code example #7
def extract_xml(filename):
    tree = ET.parse(filename)
    root = tree.getroot()
    docs = {}
    for page in root.findall(
            "{http://www.mediawiki.org/xml/export-0.10/}page"):
        id = int(
            page.find("{http://www.mediawiki.org/xml/export-0.10/}id").text)
        text = page.find("{http://www.mediawiki.org/xml/export-0.10/}revision") \
            .find("{http://www.mediawiki.org/xml/export-0.10/}text").text
        docs[id] = Doc(id, text)
    return docs
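
A possible variant of the same extraction that registers the MediaWiki export-0.10 namespace once instead of repeating the full URI in every path; this is a sketch rather than project code, and it assumes the same Doc(id, text) constructor:

NS = {"mw": "http://www.mediawiki.org/xml/export-0.10/"}

def extract_xml_ns(filename):
    root = ET.parse(filename).getroot()
    docs = {}
    for page in root.findall("mw:page", NS):
        page_id = int(page.find("mw:id", NS).text)
        text = page.find("mw:revision/mw:text", NS).text
        docs[page_id] = Doc(page_id, text)
    return docs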
Code example #8
def list_all_entities():
    """POST request, generate a response listing the details of all the entities.

    Args:
        raw text: the user should post an article (raw text format) to this api.

    Returns:
        json: list the details of the entities in the article, in the json format
    """

    article = request.data.decode()
    my_doc = Doc(article)
    dic = []
    mapping = my_doc.map_position_start_index()

    for ent in my_doc.get_doc().ents:
        ent_dic = {}
        start_index = ent.start_char
        position = my_doc.get_position(start_index, mapping)
        label = my_doc.get_label(ent)

        ent_dic["entity"] = ent.text
        ent_dic["position"] = position
        ent_dic["label"] = label

        dic.append(ent_dic)
    return jsonify(dic)
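
A hedged client-side sketch for exercising this endpoint; the route and port are illustrative, since the excerpt does not show the Flask route decorator:

import requests

resp = requests.post("http://localhost:5000/list_all_entities",   # hypothetical route and port
                     data="Barack Obama visited Paris in 2009.")
for ent in resp.json():   # each item carries "entity", "position" and "label"
    print(ent["entity"], ent["label"], ent["position"])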
Code example #9
    def saveDoc(doc: Doc):
        """Сохранение данных"""

        # Сдесь почему-то не получилось сохранять
        # Обьект Doc целиком
        # Поэтому превращаю словарь в список и сохраняю в файл
        result = []
        for key, figure in doc.all().items():
            result.append(figure)

        output = open(Data.__file_name, 'wb')
        pickle.dump(result, output)
        output.close()
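
A possible counterpart for reading the data back, assuming it sits next to saveDoc inside the same class so that the private Data.__file_name attribute resolves; a sketch, not code from the project:

    def loadDoc() -> list:
        """Load the list of figures written by saveDoc."""
        with open(Data.__file_name, 'rb') as source:
            return pickle.load(source)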
Code example #10
def visualization():
    """ POST request, can be made via Postman.

    Notes:
        the format of the body of the POST request must be 'raw text'.

    Args:
        raw text: the body of the POST request should be raw text.
    Returns:
        String: a URL containing the reference number of this POST; the user
        can open the URL in a browser to see the visualized entity-extraction result
    """
    article = request.data.decode()  # String

    my_doc = Doc(article)
    reference_number = str(uuid.uuid4())

    html = displacy.render(my_doc.get_doc(), style="ent")
    catch.add_reference(reference_number, html)

    return jsonify({
        'your reference':
        f"http://{request.host}/get?reference={reference_number}"
    })
Code example #11
File: Banco.py Project: Edu93Reis/aulaPoo-Python
def banco():
    c1 = Cliente("Eduardo", 1200.00, 5000.00)
    c2 = Cliente("Carlos", 2000.00, 900.00)

    print("Telas saques: ")
    c1.Sacar(6200.00)
    print("Saldo do cliente: ", c1.getNome(), " é: ", c1.checarSaldo())
    c2.Sacar(100.0)
    print("Saldo do cliente, ", c2.getNome(), " é: ", c2.checarSaldo())
    c2.Sacar(3000.0)

    d1 = Doc()

    print(
        "------------------------ Tela Docs: -------------------------------")
    print("Saldo inicial do cliente, ", c1.getNome(), ", é: ",
          c1.checarSaldo())
    print("Saldo inicial do cliente, ", c2.getNome(), ", é: ",
          c2.checarSaldo())
    print(
        "------------------------ Transferência 1: --------------------------")
    d1.transferir(c1, c2, 300.00)
    print("Saldo atualizado do cliente, ", c1.getNome(), ", é: ",
          c1.checarSaldo())
    print("Saldo atualizado do cliente, ", c2.getNome(), ", é: ",
          c2.checarSaldo())
    print(
        "------------------------ Transferência 2: --------------------------")
    d1.transferir(c1, c2, 30000.00)
    print("Saldo atualizado do cliente, ", c1.getNome(), ", é: ",
          c1.checarSaldo())
    print("Saldo atualizado do cliente, ", c2.getNome(), ", é: ",
          c2.checarSaldo())
    print(
        "------------------------ Transferência 3: --------------------------")
    d1.transferir(c1, c2, 5900.00)
    print("Saldo atualizado do cliente, ", c1.getNome(), ", é: ",
          c1.checarSaldo())
    print("Saldo atualizado do cliente, ", c2.getNome(), ", é: ",
          c2.checarSaldo())
Code example #12
File: Input.py Project: krahulreddy/GSoC_POCs
def fetch_and_create_doc(connection, name="Monaco"):
    print("=================================================================")
    print("Trying to fetch row and create doc")

    sql = "SELECT place_id, osm_id, osm_type, name, address, \
country_code, housenumber, postcode from placex \
where name->'name' like '" + name + "' limit 1 "
    cursor = connection.cursor(cursor_factory=RealDictCursor)
    cursor.execute(sql)
    record = cursor.fetchone()
    print(sql, "\n")

    # place_id, osm_id, osm_type, name, address, country_code, housenumber, \
    #     postcode = record.values()
    doc = Doc(record)

    print("osm_id:", doc.osm_id)
    print("osm_type:", doc.osm_type)
    print("name tags as dictionary:", doc.name)
    cursor.close()
    return doc
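
The query above splices name directly into the SQL string, which breaks on quotes and is open to SQL injection; a hedged variant using psycopg2 parameter binding (same columns and cursor factory, function name invented for the sketch):

from psycopg2.extras import RealDictCursor

def fetch_place_row(connection, name="Monaco"):
    sql = ("SELECT place_id, osm_id, osm_type, name, address, "
           "country_code, housenumber, postcode FROM placex "
           "WHERE name->'name' LIKE %s LIMIT 1")
    cursor = connection.cursor(cursor_factory=RealDictCursor)
    cursor.execute(sql, (name,))      # the driver handles quoting and escaping
    record = cursor.fetchone()
    cursor.close()
    return record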
Code example #13
File: Base.py Project: derAndreas/pyGtk3Docs
class Base(object):
    """ Base Class all ast.* Nodes should be derived from"""
    NS_CORE = NS_CORE
    NS_C = NS_C
    NS_GLIB = NS_GLIB
    
    def __init__(self, namespace, node):
        """ Init a new Ast element, store the parent global namespace
        instance and etree XML Node and call parse_node(). """
        
        self._namespace = namespace  # needed later for type resolution
        self.node = node
        self.parse_node()
        
    def parse_node(self):
        """ Parse the current Node in the derived AST Object
        Overwrite this method in the concrete implementation """
        raise NotImplementedError('"Ast::%s::parse_node()" not implemented in derived class' % self.__class__.__name__)
        
    def toObjectRepr(self):
        """ Collect all parsed informations and return
        a python object that can be processed later for output 
        Overwrite this method in the concrete implementation """
        raise NotImplementedError('"Ast::%s::toObjectRepr()" not implemented in derived class' % self.__class__.__name__)
    
    def parse_doc(self):
        """ Parse a <doc> node in the ast.* instance if there is any """
        doc = self.node.find(self.ns(self.NS_CORE, 'doc'))
        
        if doc is not None:
            # need to import Doc here, because of nested usage of Base class,
            # which causes infinite import loop
            from Doc import Doc
            self.doc = Doc(self._namespace, doc)
    
    def parse_parameters(self):
        """ Parse <parameters> node and each subnode <parameter> """
        if hasattr(self, 'parameters') is False:
            self.parameters = []
        
        params = self.node.find(self.ns(self.NS_CORE, 'parameters'))
        
        if params is not None:
            from Parameter import Parameter
            for param in self.find_children(self.ns(self.NS_CORE, 'parameter'), params):
                self.parameters.append(Parameter(self._namespace, param))
    
    def parse_attributes_from_map(self, mapping):
        """ Universal method to parse attributed from a node based
        on the given parse map. The mapping is a python Dict that looks like
        
            mapping = {
                'key_to_set_in_ast_instance' : 'name_of_the_xml_attribute'
            }
        
        The key of the mapping dict will be the property/attribute name of
        the ast.* instance. If a mapping is {'myCoolVersion' : 'version'} the
        ast instance will have the property self.myCoolVersion
        """
        
        for key, attrib in mapping.iteritems():
            if isinstance(attrib, tuple):
                attrib = self.ns(attrib[0], attrib[1])
            
            self.__dict__[key] = self.node.get(attrib)
            
    def parse_returnvalue(self):
        """ Parse the <return-value> node """
        rValue = self.node.find(self.ns(self.NS_CORE, 'return-value'))
        if rValue is not None:
            # need to import here, because of import loop
            from ReturnValue import ReturnValue
            self.returnValue = ReturnValue(self._namespace, rValue)
            
    def parse_types(self, node):
        """ Method to lookup types used as parameters or return values 
        Lookup the type defined in TypeDef.py and return the Type* instance
        that represents it 
        
        Parameters:
            node -- etree XML node to get and parse the type for
        
        Returns
            Returns an instance of ast.Type* based on the type that the 
            input node describes
        
        """
        if node is None:
            return node
            
        from Type import Type
        from TypeArray import TypeArray
        from TypeVarArg import TypeVarArg
    
        typenode = node.find(self.ns(self.NS_CORE, 'type'))

        if typenode is not None:
            return Type(self._namespace, typenode)
        
        arraynode = node.find(self.ns(self.NS_CORE, 'array'))
        
        if arraynode is not None:
            return TypeArray(self._namespace, arraynode)
        
        varnode = node.find(self.ns(self.NS_CORE, 'varargs'))
        
        if varnode is not None:
            return TypeVarArg(self._namespace, varnode)
        

    def ns(self, ns, tag):
        """ Universal function to generate NS string for XML usage
        
            Arguments:
            ns  -- The namespace string
            tag -- The tag name 
            
            Returns:
            The namespace string as "{%ns}%tag"
        """
        return "{%s}%s" % (ns, tag)
        
    def find_children(self, tag, node=None):
        """ Find direct children of the input etree node or 
        the self.node etree object and return all children that
        match the tag name
        
        Arguments:
            tag  -- Name of the tag to find as direct child (string)
            node -- an etree() node, if not present it will use self.node
        
        Returns:
            An array of etree node objects
        """
        
        if node is None:
            node = self.node
            
        return [c for c in node.getchildren() if c.tag == tag]


    def getNamepspaceTag(self):
        """ Return the global main ast.Namespace.Namespace Object
        
        Returns
            Instance of ast.Namespace.Namespace
        """
        return self._namespace
        
    def getName(self):
        """ Return the value of self.name which should 
        be set in every ast instance
        
        Returns:
            String
        """
        assert self.name
        return self.name
        
    def getVersion(self):
        """ Return the self.version value if present
        If not present, return an empty string
        
        Returns:
            String
        """
        if self.version is None:
            return ''
        return self.version
        
    def getType(self):
        """ Return ast.Type* instance, if there is any type object
        This will only availbe in Parameters, returnValue (etc) instances
        where a <type> node exists
        
        Returns
            None or ast.Type* instance
        """
        return self.type
    
    def getCType(self):
        """ Return the Ctype, which is the attribute "c:type" from the XML node
        
        Returns
            None or String
        """
        if hasattr(self, 'ctype'):
            return self.ctype
        return None
    
    def getDoc(self, asObjectRepr=False):
        """ Get the <doc> ast.Doc instance for this ast instance if there is any
        If called with Argument asObjectRepr=True it will directly call
        toObjectRepr() on the ast.Doc instance. 
        
        Arguments:
            asObjectRepr -- true if directly call toObjectRepr() on ast.Doc instance (default False)
        
        Returns
            String or ast.Doc instance
        """
        if hasattr(self, 'doc'):
            if asObjectRepr is True:
                return self.doc.toObjectRepr()
            else:
                return self.doc
        
        return ''
        
    def getParentTree(self, result=None):
        """Get the Parent Tree for the current node if possible.
        Maybe only available for <class> nodes and derived ast.Klass implementations
        
        Returns:
            None or a sorted array with parents
        """
        if self._parent is None:
            return None
            
        if result is None:
            result = [self.name]
        else:
            result.append(self.name)
        
        if isinstance(self._parent, Base):
            self._parent.getParentTree(result)
        elif isinstance(self._parent, str):
            result.append(self._parent)
        else:
            raise ValueError('Unknown Type found in getParentTree()')
        
        return result
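
To illustrate how the hooks above fit together, a hedged sketch of a minimal concrete subclass; the class name, element and attribute mapping are invented here, since the real ast.* subclasses live in other files of the project:

class Constant(Base):
    """ Illustrative ast node that would parse a hypothetical <constant> element """

    def parse_node(self):
        self.parse_attributes_from_map({'name': 'name', 'version': 'version'})
        self.parse_doc()
        self.type = self.parse_types(self.node)

    def toObjectRepr(self):
        return {
            'name': self.getName(),
            'version': self.getVersion(),
            'doc': self.getDoc(asObjectRepr=True),
        }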
Code example #14
File: testVSP.py Project: adhaka/playground
def main():

	parser = argparse.ArgumentParser()

	parser.add_argument('--order', type=int)
	args = parser.parse_args()
	foodList1 = ['hot', 'chocolate', 'milk', 'taco', 'sandwich', 'sushi', 'schnitzel', 'naan', 'croissant', 'bulle']
	vsp = VSP.VectorSpaceModel()
	vsp.loadDict(foodList1)

	ordered = False
	if args.order == 1:
		ordered=True

	doc1str = ' hot chocolate milk sandwich'
	doc1 = Doc(doc1str)
	doc1.vectorify(vsp)
	doc1Vec = doc1.getVector()
	doc2str = 'hot milk sandwich sushi naan'
	doc2 = Doc(doc2str)
	doc2.vectorify(vsp)
	doc2Vec = doc2.getVector()
	query1str = 'milk hot sushi'
	query1 = Doc(query1str)
	query1.vectorify(vsp, ordered=ordered)
	query1Vec = query1.getVector()
	score1 = vsp.dotProduct(query1Vec, doc1Vec)
	score2 = vsp.dotProduct(query1Vec, doc2Vec)
	# score3 = vsp.dotProduct(query1Vec, doc3Vec)
	query2str = 'hot milk sushi'
	query2 = Doc(query2str)
	query2.vectorify(vsp, ordered=ordered)
	query2Vec = query2.getVector()
	score21 = vsp.dotProduct(query2Vec, doc1Vec)
	score22 = vsp.dotProduct(query2Vec, doc2Vec)

	# print doc1Vec, doc2Vec
	# print query1Vec, query2Vec
	print "Dictionary:", foodList1
	print "Document 1:", doc1str
	print "Document 2:", doc2str
	print "Query:", query1str
	print "Score for doc1:", score1
	print "Score for doc2:", score2

	print "Query 2:", query2str
	print "Score for doc1:", score21
	print "Score for doc2:", score22

	scoreMap_Q1 = {}
	scoreMap_Q2 = {}

	# the key-value pairs below can be stored in a persistence layer for a fast lookup ...
	scoreMap_Q1[doc1.getDocNum()] = score1
	scoreMap_Q1[doc2.getDocNum()] = score2
	scoreMap_Q2[doc1.getDocNum()] = score21
	scoreMap_Q2[doc2.getDocNum()] = score22

	scoreMap_Q1_sorted = sortScoresWithOrder(scoreMap_Q1, desc=True)
	scoreMap_Q2_sorted = sortScoresWithOrder(scoreMap_Q2)

	# print scoreMap_Q1_sorted
	print "For query1- score sheet:"
	for k,v in scoreMap_Q1_sorted:
		print('DocNumber: %i, score:%5.4f' %(k,v))
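
For intuition, each score above is just a dot product between term vectors over the dictionary; a tiny standalone illustration with a shortened dictionary (independent of the VSP class):

dictionary = ['hot', 'chocolate', 'milk', 'sandwich', 'sushi']
doc_vec    = [1, 1, 1, 1, 0]    # ' hot chocolate milk sandwich'
query_vec  = [1, 0, 1, 0, 1]    # 'milk hot sushi'
score = sum(d * q for d, q in zip(doc_vec, query_vec))
print(score)                    # 2 -> overlap on 'hot' and 'milk'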
Code example #15
File: query.py Project: arakhsha/MIR_Project
        docs = read_docs('../data/English.csv')
        preprocessor = EnglishPreprocessor(docs)
    else:
        docs = read_docs('../data/Persian.xml')
        preprocessor = PersianPreprocessor(docs)

    for doc in docs.values():
        doc.words = preprocessor.preprocess(doc.text)

    print("Preprocess is done!")

    index = PositionalIndexer(docs, 1).index
    print("Index Created Successfully!")

    query = input("Enter Query: ")
    q_doc = Doc(0, query)
    q_doc.words = preprocessor.preprocess(q_doc.text)

    query_tag = input("Enter Tag (1, 2, 3, 4, None): ")
    tag = None
    if query_tag in ["1", "2", "3", "4"]:
        tag = int(query_tag)

    if tag is not None:
        classify(docs)

    results = search(q_doc, docs, index, 10, query_tag)
    for result in results:
        print(result[1])
        print(result[0].text)
        print()
Code example #16
File: ECBParser.py Project: chriswtanner/CRETE
    def parseCorpus(self, docToVerifiedSentences):

        # maps the long, original REF names to a small, more readable REF ID
        REFToUREF = {}
        UREF = 0

        print("* parsing ECB corpus:", self.args.corpusPath)
        numMentionsIgnored = 0
        corpus = Corpus()
        files = []

        filteredTrainingDirs = self.helper.trainingDirs[0:self.args.devDir]
        print("filteredTrainingDirs:", filteredTrainingDirs)
        for root, _, filenames in os.walk(self.args.corpusPath):
            for filename in fnmatch.filter(filenames, '*.xml'):
                f = os.path.join(root, filename)
                doc_id = f[f.rfind("/") + 1:]
                dir_num = int(doc_id.split("_")[0])
                if dir_num in self.helper.trainingDirs and dir_num not in filteredTrainingDirs:
                    continue
                files.append(os.path.join(root, filename))

        globalSentenceNum = 0
        lastToken_id = -1
        intraCount = 0
        
        # used for keeping track of how many mentions were pronouns
        had_pronoun = 0
        not_had_pronoun = 0
        num_events_with_pronouns = 0
        for f in sorted(files):
            lm_idToMention = {} # only used to tmp store the mentions
            removed_m_ids = set() # keeps track of the mentions that had pronouns and we removed (if we care to remove them)
            doc_id = f[f.rfind("/") + 1:]
            dir_num = int(doc_id.split("_")[0])
            extension = doc_id[doc_id.find("ecb"):]
            dirHalf = str(dir_num) + extension

            curDoc = Doc(doc_id)
            corpus.ECBDirs[dir_num].docs[doc_id] = curDoc
            corpus.dirHalves[dirHalf].docs[doc_id] = curDoc
            tmpDocTokens = []
            tmpDocTokenIDsToTokens = {}

            # opens the xml file and makes needed replacements
            input_file = open(f, 'r', encoding="utf-8")
            #with open(f, 'r', encoding="utf-8") as myfile:
            fileContents = input_file.read().replace('\n', ' ')            
            for badToken in self.replacementsList:  # self.replacementsSet:
                fileContents = fileContents.replace(badToken, self.replacements[badToken])

            # reads <tokens>
            it = tuple(re.finditer(r"<token t\_id=\"(\d+)\" sentence=\"(\d+)\" number=\"(\d+)\".*?>(.*?)</(.*?)>", fileContents))
            lastSentenceNum = -1

            if self.write_stanford_input:
                tmp_line_to_stanford_input = defaultdict(list)

            # numbers every token in each given sentence, starting at 1 (each sentence starts at 1)
            tokenNum = 0
            firstToken = True
            lastTokenText = ""

            for match in it:
                t_id = match.group(1)
                sentenceNum = int(match.group(2))
                hTokenNum = int(match.group(3))  # only used for matching w/ HDDCRP's files
                
                # tokenText = match.group(4).rstrip()  # should be used if we write the corpus out for Stanford
                tokenText = match.group(4).lower().rstrip()
                # removes tokens that end in : (e.g., newspaper:) but leaves the atomic ":" alone
                if len(tokenText) > 1 and tokenText[-1] == ":":
                    tokenText = tokenText[:-1]
                if tokenText == "''":
                    tokenText = "\""
                elif tokenText == "''bagman\"":
                    tokenText = "\"bagman\""
                    print("* replaced bagman1")
                elif tokenText == "''bagman":
                    tokenText = "\"bagman"
                    print("* replaced bagman2")
    
                if sentenceNum > curDoc.highestSentenceNum:
                    curDoc.highestSentenceNum = sentenceNum
                
                if sentenceNum > 0 or "plus" not in doc_id:
                    
                    # writes Stanford_input
                    if self.write_stanford_input:
                        tmp_line_to_stanford_input[int(sentenceNum)].append(match.group(4).rstrip())

                    hSentenceNum = sentenceNum
                    if "plus" in doc_id:
                        hSentenceNum = sentenceNum - 1

                    # TMP
                    '''
                    if sentenceNum not in tmpSentenceNums:
                        tmpSentenceNums.append(sentenceNum)
                    '''

                    # we are starting a new sentence
                    if sentenceNum != lastSentenceNum:
                        # we are possibly ending the prev sentence
                        if not firstToken:
                            # if sentence ended with an atomic ":", let's change it to a "."
                            if lastTokenText == ":":
                                lastToken = tmpDocTokenIDsToTokens[lastToken_id]
                                lastToken.text = "."
                                tmpDocTokenIDsToTokens[lastToken_id] = lastToken
                            elif lastTokenText not in self.endPunctuation:
                                endToken = Token("-1", lastSentenceNum, globalSentenceNum, tokenNum, doc_id, hSentenceNum, hTokenNum, ".")
                                tmpDocTokens.append(endToken)

                            globalSentenceNum = globalSentenceNum + 1

                        tokenNum = 0
                    # adds token
                    curToken = Token(t_id, sentenceNum, globalSentenceNum, tokenNum, doc_id, hSentenceNum, hTokenNum, tokenText)
                    #corpus.UIDToToken[curToken.UID] = curToken
                    #curDoc.UIDs.append(curToken.UID)
                    tmpDocTokenIDsToTokens[t_id] = curToken

                    firstToken = False
                    tmpDocTokens.append(curToken)
                    tokenNum = tokenNum + 1
                    curDoc.globalSentenceNums.add(globalSentenceNum)
                lastSentenceNum = sentenceNum
                lastTokenText = tokenText
                lastToken_id = t_id

            if self.write_stanford_input:
                tmpFOUT = open("../data/stanford_in/"+doc_id, "w")
                for sent_num in sorted(tmp_line_to_stanford_input.keys()):
                    tmpFOUT.write(" ".join(tmp_line_to_stanford_input[sent_num]) + "\n")
                tmpFOUT.close()

            # if sentence ended with an atomic ":", let's change it to a "."
            if lastTokenText == ":":
                lastToken = tmpDocTokenIDsToTokens[lastToken_id]
                lastToken.text = "."
                tmpDocTokenIDsToTokens[lastToken_id] = lastToken
            elif lastTokenText not in self.endPunctuation:
                endToken = Token("-1", lastSentenceNum, globalSentenceNum, tokenNum, doc_id, -1, -1, ".")
                tmpDocTokens.append(endToken)

            globalSentenceNum = globalSentenceNum + 1

            # reads <markables> 1st time
            regex = r"<([\w]+) m_id=\"(\d+)?\".*?>(.*?)?</.*?>"
            markables = fileContents[fileContents.find("<Markables>")+11:fileContents.find("</Markables>")]
            it = tuple(re.finditer(regex, markables))
            for match in it:
                # gets the token IDs
                regex2 = r"<token_anchor t_id=\"(\d+)\".*?/>"
                it2 = tuple(re.finditer(regex2, match.group(3)))
                tmpCurrentMentionSpanIDs = []
                hasAllTokens = True
                for match2 in it2:
                    tokenID = match2.group(1)
                    tmpCurrentMentionSpanIDs.append(int(tokenID))
                    if tokenID not in tmpDocTokenIDsToTokens.keys():
                        hasAllTokens = False

            for t in tmpDocTokens:
                corpus.addToken(t)
                curDoc.tokens.append(t)
                corpus.UIDToToken[t.UID] = t

                #if doc_id == "31_3ecbplus.xml":
                #    print("t:",t)
                
            # reads <markables> 2nd time
            regex = r"<([\w]+) m_id=\"(\d+)?\".*?>(.*?)?</.*?>"
            markables = fileContents[fileContents.find("<Markables>")+11:fileContents.find("</Markables>")]
            it = tuple(re.finditer(regex, markables))
            for match in it:
                isPred = False
                mentionType = match.group(1)
                if "ACTION" in mentionType:
                    isPred = True
                m_id = int(match.group(2))

                # gets the token IDs
                regex2 = r"<token_anchor t_id=\"(\d+)\".*?/>"
                it2 = tuple(re.finditer(regex2, match.group(3)))
                tmpTokens = []
                text = []
                hasAllTokens = True

                has_pronoun = False
                for match2 in it2:
                    tokenID = match2.group(1)
                    if tokenID in tmpDocTokenIDsToTokens.keys():
                        cur_token = tmpDocTokenIDsToTokens[tokenID]
                        tmpTokens.append(cur_token)
                        text.append(cur_token.text)

                    else:
                        hasAllTokens = False

                # only process Mentions if they adhere to our preferences of using pronouns or not
                # determines if it has a pronoun or not (and if we care)
                if len(text) == 1:
                    if text[0] in self.helper.pronouns:
                        has_pronoun = True

                if has_pronoun:
                    had_pronoun += 1
                    if isPred:
                        num_events_with_pronouns += 1
                else:
                    not_had_pronoun += 1

                # possibly add the mention 
                use_pronoun = False
                if isPred:
                    use_pronoun = self.helper.event_pronouns
                else:
                    use_pronoun = self.helper.entity_pronouns
                
                use_mention = True
                if not use_pronoun and has_pronoun:
                    use_mention = False
                    #print("* not constructing mention:", text)
                    removed_m_ids.add(m_id)

                # we should only have incomplete Mentions for our hand-curated sample corpus,
                # since we do not want all mentions there, we curtail the sentences of tokens
                if hasAllTokens and use_mention:
                    curMention = Mention(dirHalf, dir_num, doc_id, tmpTokens, text, isPred, mentionType)
                    lm_idToMention[m_id] = curMention
                    #tmpSentenceNumToMentions[tmpTokens[0].sentenceNum].append(curMention)
                    #corpus.addMention(curMention, "123")
            # reads <relations>
            relations = fileContents[fileContents.find("<Relations>"):fileContents.find("</Relations>")]
            regex = r"<CROSS_DOC_COREF.*?note=\"(.+?)\".*?>(.*?)?</.*?>"
            it = tuple(re.finditer(regex, relations))
            for match in it:
                REF = match.group(1)
                regex2 = r"<source m_id=\"(\d+)\".*?/>"
                it2 = tuple(re.finditer(regex2, match.group(2)))
                # only keep track of REFs for which we have found Mentions
                for match2 in it2:
                    m_id = int(match2.group(1))
                    if m_id not in lm_idToMention:
                        
                        if  m_id not in removed_m_ids:
                            print("*** MISSING MENTION! EXITING 1")
                            exit(1)
                    else: #elif lm_idToMention[m_id].isPred:
                        foundMention = lm_idToMention[m_id]
                        if self.onlyEvents and not foundMention.isPred:
                            continue
                        token0 = foundMention.tokens[0]

                        if self.args.onlyValidSentences and token0.sentenceNum not in docToVerifiedSentences[doc_id]:
                            numMentionsIgnored += 1
                            continue
                        else:
                            corpus.addMention(foundMention, REF)

            if self.args.addIntraDocs:
                regex = r"<INTRA_DOC_COREF.*?>(.*?)?</.*?>"
                it = tuple(re.finditer(regex, relations))
                for match in it:
                    regex2 = r"<source m_id=\"(\d+)\".*?/>"
                    it2 = tuple(re.finditer(regex2, match.group(1)))
                    # only keep track of REFs for which we have found Mentions
                    for match2 in it2:
                        m_id = int(match2.group(1))
                        if m_id not in lm_idToMention:
                            print("*** MISSING MENTION! EXITING 2")
                            exit(1)
                        else:
                            foundMention = lm_idToMention[m_id]
                            if self.onlyEvents and not foundMention.isPred:
                                continue
                            token0 = foundMention.tokens[0]

                            if self.args.onlyValidSentences and token0.sentenceNum not in docToVerifiedSentences[doc_id]:
                                numMentionsIgnored += 1
                                continue
                            else:
                                corpus.addMention(foundMention, "INTRA"+str(intraCount))
                                intraCount += 1
            corpus.addDocPointer(doc_id, curDoc)

            # optionally displays annotations (Mentions clearly designated w/ unique REF IDs#)
            if self.printCorpusTokens:
                print("\n------------------\ndoc:",doc_id,"\n------------------")
                sent_num = -1
                oline = ""
                lastMentions = set()
                for t in curDoc.tokens:
                    if t.sentenceNum != sent_num and sent_num != -1:
                        sent_num = t.sentenceNum
                        print(oline)
                        oline = ""
                    added = False
                    removed = False
                    urefToAdd = -1
                    entOrEventToAdd = ""
                    for m in t.mentions:
                        if m not in lastMentions:
                            if m.REF in REFToUREF.keys():
                                urefToAdd = REFToUREF[m.REF]
                            else:
                                urefToAdd = UREF
                                REFToUREF[m.REF] = UREF
                                UREF += 1
                            if m.isPred:
                                entOrEventToAdd = "v"
                            else:
                                entOrEventToAdd = "ent"
                            added = True
                    
                    if len(lastMentions) > 0:
                        for m in lastMentions:
                            if m not in t.mentions:
                                removed = True
                    if removed:
                        oline += "] "
                    if added:
                        if len(oline) > 0 and oline[-1] != " ":
                            oline += " "
                        oline += str(entOrEventToAdd) + str(urefToAdd) + "["
                    if len(oline) > 0 and oline[-1] != " " and oline[-1] != "[":
                        oline += " "
                    oline += str(t.text)
                    lastMentions = t.mentions
                print(oline)
        corpus.assignGlobalSentenceNums()
        print("numMentionsIgnored:", numMentionsIgnored)
        print("# ECB mentions created:", len(corpus.ecb_mentions))
        num_events = 0
        for m in corpus.ecb_mentions:
            if m.isPred:
                num_events += 1
        print("\t# events:", num_events)
        print("\t\t# of event which had pronouns:", num_events_with_pronouns)
        print("\t# entities:", len(corpus.ecb_mentions) - num_events)
        print("# ECB+ tokens:", len(corpus.corpusTokens))
        print("# mentions that had_pronoun:", had_pronoun)
        print("# mentions that did not had_pronoun:", not_had_pronoun)

        return corpus
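
The token regex near the top of parseCorpus drives the token extraction; a small self-contained check of what its groups capture (the XML snippet is invented for illustration):

import re

sample = '<token t_id="7" sentence="2" number="3">Obama</token>'
pattern = r"<token t\_id=\"(\d+)\" sentence=\"(\d+)\" number=\"(\d+)\".*?>(.*?)</(.*?)>"
match = next(re.finditer(pattern, sample))
print(match.group(1), match.group(2), match.group(3), match.group(4))   # 7 2 3 Obama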