def get_annotated_comments(self):
    """Annotate every tokenized comment with a polarity tag.

    For each entry in ``self.comments`` (comment_key -> list of
    ``(counter, text)`` pairs) the text is tokenized and each token is
    rendered as ``"token/TAG "``, where TAG is looked up in the
    ``self.kms`` lexicon.  Tokens missing from the lexicon are either
    normalised through ``EDBNormaliser`` (when ``self.normalised`` is
    true) or tagged ``UNK``.

    Returns:
        tuple: ``(self.comments, dct)`` where ``dct`` maps each comment
        key to a list of annotated comment strings.
    """
    dct = {}
    for comment_key, comments_val in self.comments.items():
        # Each comments_val entry is a (counter, text) pair; tokenize the text.
        tokens_comments = [TokenHandler().word_tokenizer(comment[1])
                           for comment in comments_val]
        annotated = dct.setdefault(comment_key, [])
        for tokens_comment in tokens_comments:
            # Collect "token/TAG " fragments and join once instead of
            # repeatedly concatenating strings (avoids quadratic behavior).
            parts = []
            for ctr_token, token in enumerate(tokens_comment):
                if token in self.kms:
                    parts.append("{}/{} ".format(token, self.kms[token]))
                elif self.normalised:
                    # Previous token (or the sentence-start marker) supplies
                    # the bigram context used for normalisation.
                    prev_token = tokens_comment[ctr_token - 1] if ctr_token > 0 else "<s>"
                    normalised_token = EDBNormaliser(
                        (prev_token, token), self.kms, self.cpd_pickled).normalise_token()
                    if normalised_token is None:
                        parts.append("{}/{} ".format(token, 'UNK'))
                    else:
                        parts.append(self.get_token_polarity(normalised_token))
                else:
                    parts.append("{}/{} ".format(token, 'UNK'))
            result = ''.join(parts)
            annotated.append(result)
            print(result)
    return self.comments, dct
def test_handleStartTagTokenHandlesFirstStartTag():
    """The very first start tag becomes the root element's first child."""
    handler = TokenHandler("TestRootUrl/")
    assert len(handler.elementTreeRoot.children) == 0

    handler.handleStartTagToken(StartTagToken("html"))

    children = handler.elementTreeRoot.children
    assert len(children) == 1
    assert children[0].name == "html"
def parseHTML(self, htmlString):
    """Tokenize *htmlString*, build the element tree, and print the render list.

    The string is fed character by character through a
    TokenizerStateMachine.  Emitted ``<link>`` start tags are routed to
    ``handleLinkToken``; every other token goes to a TokenHandler that
    builds the element tree.  The render list is then rebuilt from that
    tree and printed line by line.
    """
    tsm = TokenizerStateMachine()
    tokenHandler = TokenHandler()
    tokenHandler.rootUrl = self.rootUrl
    i = 0
    while i < len(htmlString):
        # handleCharacter returns how many characters were consumed.
        i += tsm.handleCharacter(htmlString[i])
        token = tsm.currentEmittedToken
        if token is not None:
            if isinstance(token, StartTagToken) and token.name == "link":
                # <link> tags (e.g. stylesheets) are handled separately
                # from the regular element tree.
                self.handleLinkToken(token)
            else:
                tokenHandler.processToken(token)
            tsm.currentEmittedToken = None
    # Rebuild the render state from the freshly parsed tree.
    self.strList.clear()
    self.renderList.clear()
    self.renderObjects.clear()
    self.fillRenderList(tokenHandler.elementTreeRoot, self.renderList)
    for s in self.renderList:
        print(s)
def convertTokenListToHTMLElementTree(self, url, tokenList):
    """Build an HTML element tree from *tokenList*.

    The root URL extracted from *url* is passed to the TokenHandler so
    that relative links can be resolved while processing tokens.

    Returns:
        The root element of the constructed tree.
    """
    rootUrl = self.extractRootUrl(url)
    tokenHandler = TokenHandler(rootUrl)
    for token in tokenList:
        tokenHandler.processToken(token)
    return tokenHandler.elementTreeRoot
def test_getAbsoluteUrl():
    """Absolute URLs pass through unchanged; relative ones get the root prepended."""
    rootUrl = "TestRootUrl/"
    handler = TokenHandler(rootUrl)

    absolute_urls = [
        "http://google.com",
        "https://i.ytimg.com/vi/nrIDL7h9MFQ/hqdefault.jpg?sqp=-oaymwEYCNIBEHZIVfKriqkDCwgBFQAAiEIYAXAB&rs=AOn4CLByrwt1ptJWI5zGkLOZhJpyrFeCSw",
    ]
    for url in absolute_urls:
        assert handler.getAbsoluteUrl(url) == url

    assert handler.getAbsoluteUrl("ARelativeUrl") == rootUrl + "ARelativeUrl"
    assert handler.getAbsoluteUrl("/RelativeWithSlash") == rootUrl + "RelativeWithSlash"
def test_processTokenHandlesClosingTags():
    """Matched start/end tag pairs become siblings under the root."""
    handler = TokenHandler("TestRootUrl/")
    for tag_name in ("p", "a"):
        handler.processToken(StartTagToken(tag_name))
        handler.processToken(EndTagToken(tag_name))

    root = handler.elementTreeRoot
    assert root.name == "#root#"
    assert len(root.children) == 2
    assert [child.name for child in root.children] == ["p", "a"]
def __init__(self) -> None:
    """Initialize the program and set the class variables used as the
    initial values across the program.

    Required command line arguments:
        GitHub URL (https://github.com/{Username}/{Repository})
    Optional command line arguments:
        GitHub Personal Access Token
    """
    # Command line arguments, excluding the script filename itself.
    self.args = sys.argv[1:]
    # GitHub target details, populated later from the arguments.
    self.githubURL = None
    self.githubUser = None
    self.githubRepo = None
    self.githubToken = None
    self.githubTokenList = None  # Pulled from keys.txt
    # Database-specific state.
    self.dbCursor = None
    self.dbConnection = None
    # Class instance used to write and read tokens to tokens.txt.
    self.th = TokenHandler()
def test_processTokenProcessesIndependentTags():
    """Void and self-closing tags do not swallow subsequent elements."""
    img = StartTagToken("img")
    img.isSelfClosing = True

    handler = TokenHandler("TestRootUrl/")
    for token in (StartTagToken("br"), img, StartTagToken("html"), EndTagToken("html")):
        handler.processToken(token)

    child_names = [child.name for child in handler.elementTreeRoot.children]
    assert len(child_names) == 3
    assert child_names == ["br", "img", "html"]
def test_processTokenHandlesUnacceptableTags():
    """Unknown or disallowed tags are dropped; only the valid <p> survives."""
    handler = TokenHandler("TestRootUrl/")
    # Open tags outermost-first, then close them in reverse (proper nesting).
    nested_tags = ["p", "NoHTMLTagShouldEverHaveThisName", "body", "a"]
    for tag_name in nested_tags:
        handler.processToken(StartTagToken(tag_name))
    for tag_name in reversed(nested_tags):
        handler.processToken(EndTagToken(tag_name))

    root_children = handler.elementTreeRoot.children
    assert len(root_children) == 1
    assert root_children[0].name == "p"
def get_political_news_comment(self):
    """Fetch political news comments from the database.

    Returns:
        dict: maps ``comment_id`` to a list of ``(counter, comment)``
        tuples, where ``comment`` is the lower-cased text re-joined from
        its word tokens and ``counter`` is a global running index across
        all fetched rows.
    """
    conn = self.create_connection()
    with conn:
        cur = conn.cursor()
        # NOTE(review): "news_category = nc2 and news_category = 1" looks
        # suspicious -- nc2 is compared as a bare column and the two
        # predicates can only both hold if nc2 == 1.  Confirm against the
        # schema / intent before changing.
        sql = "select comment_id, comment from t_comments where comment_id " \
              "in (select news_id from t_news where news_category = nc2 " \
              "and news_category = 1)"
        cur.execute(sql)
        data = cur.fetchall()
        cur.close()
        comments = {}
        for ctr_cmmnt, content in enumerate(data):
            comment_id = content[0].strip()
            comment = ' '.join(TokenHandler().word_tokenizer(
                content[1].strip().lower()))
            # setdefault collapses the duplicated if/else append branches.
            comments.setdefault(comment_id, []).append((ctr_cmmnt, comment))
        return comments
resulting_file = input('File name as an output: ')

# Read configuration: "key = value" lines from System.conf.
# 'with' guarantees the file handle is closed (the original leaked it).
configs = {}
with open('System.conf', 'r') as f:
    for line in f:
        cfg = line.strip().split('=')
        configs[cfg[0].strip()] = cfg[1].strip()

# Get political news comments
comments = NewsHandler(configs['dbnews']).get_political_news_comment()

# Handle bigram model first to ease next computation.  The corpus file is
# expected to hold the whole corpus on its first line.
with open('political_comment_corpus.txt', 'r') as f:
    corpus = TokenHandler().word_tokenizer(f.readline())
lm = bigrams(corpus)

# Lidstone smoothing over the bigram conditional frequency distribution.
cfd_pickled = pickle.dumps(nltk.ConditionalFreqDist(lm))
lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
cpd = ConditionalProbDist(pickle.loads(cfd_pickled), lidstone_estimator)

# Annotate each comment with polarity tags.
before_annotated, after_annotated = PolarityAnnotator(
    comments, configs['dbkbbi_cleaned_offline'], cpd,
    normalised=isNormalised).get_annotated_comments()
        # Tail of an enclosing method whose 'def' line is outside this view
        # (indentation here is a best guess): write the grammar start symbol
        # and the parse-tree view to the output file, then print the
        # generated code.
        f.write(grammar.grammar.start_symbol + "\n")
        f.write(view)
        directive_handler.code_generator.print_code()

    @staticmethod
    def empty_files(file_out, file_error):
        # Truncate both output files by opening them in write mode;
        # the context managers close the handles immediately.
        with(open(file=file_out, mode="w")):
            pass
        with(open(file=file_error, mode="w")):
            pass


# --- Module-level compiler driver ---
DEFAULT_FILE_IN_NAME = "scanner.txt"
DEFAULT_FILE_OUT_NAME = "parsetree.txt"
DEFAULT_FILE_ERROR_NAME = "error.txt"

# Lexer: the C lexical DFA; whitespace and comment tokens are suppressed
# from the printed token stream.
c_lexical_dfa = CLexicalDFA.make_c_lexical_dfa()
not_printing_tokens = [CTokenType.WHITE_SPACE, CTokenType.COMMENT]
c_token_handler = TokenHandler(c_lexical_dfa, not_printing_tokens)

# Parser: an LL(1) grammar built from the compressed grammar description.
grammar = LL1Grammar(Grammar.make_grammar(compressed_grammar))
# for prod in grammar.grammar.prods:
#     print("{}->{}".format(grammar.grammar.prods[prod].non_terminal, grammar.grammar.prods[prod].rhses))
#
# for f in grammar.first_sets["simple-expression"]:
#     print(str(f))
# print(grammar.grammar.compress())
parser = Parser(grammar)
parse_handler = ParserHandler(parser)

# Run the full compile: scanner input -> parse tree + error outputs.
compiler = Compiler(c_token_handler, parse_handler)
compiler.compile(DEFAULT_FILE_IN_NAME, DEFAULT_FILE_OUT_NAME, DEFAULT_FILE_ERROR_NAME)
def test_isRelativeUrl():
    """URLs carrying a scheme are absolute; everything else is relative."""
    handler = TokenHandler("TestRootUrl/")
    assert handler.isRelativeUrl("http://google.com") == False
    assert handler.isRelativeUrl("ARelativeUrl") == True
    assert handler.isRelativeUrl("/RelativeWithSlash") == True