def getgoldevents():
    zparModel = ZPar('english-models')
    #tagger = zparModel.get_tagger()
    depparser = zparModel.get_depparser()
    #stemmer = PorterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()

    gevents = file("../ni_data/event_descriptions.tsv").readlines()
    gevents = [line.strip().split("\t")[1].strip("\"") for line in gevents]

    gold_events = []
    for line in gevents:
        # one token per line: word <TAB> tag <TAB> head-index <TAB> dep-label
        parsed_sent = depparser.dep_parse_sentence(line)
        items = parsed_sent.strip().split("\n")
        items = [item.strip().split("\t") for item in items]
        words = [item[0] for item in items]
        tags = [item[1].lower() for item in items]
        links = [int(item[2]) for item in items]
        deps = [item[3].lower() for item in items]

        # keep content words (nouns, verbs, adjectives, numbers, adverbs)
        # that fill a core grammatical role
        valid_words = [
            words[idx] for idx, tag in enumerate(tags)
            if tag[:2] in ["nn", "vb", "jj", "cd", "rb"]
            if deps[idx] in ["root", "sub", "obj", "vc", "vmod", "nmod", "pmod"]
        ]
        #stemmed_words = [stemmer.stem(word.lower()) for word in valid_words if word not in ["is", "are", "a", "an", "be", "had", "ha"]]
        stemmed_words = [
            wordnet_lemmatizer.lemmatize(word.lower())
            for word in valid_words
            if word not in ["is", "are", "a", "an", "be", "had", "ha"]
        ]
        print "-gold", stemmed_words
        gold_events.append(list(set(stemmed_words)))
    return gold_events
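# A minimal sketch (not from the original project) of the token format that
# getgoldevents() above assumes: dep_parse_sentence() yields one token per
# line with tab-separated "word <TAB> tag <TAB> head-index <TAB> dep-label"
# fields. parse_dep_output and the example call are illustrative.
def parse_dep_output(parsed_sent):
    tokens = []
    for line in parsed_sent.strip().split("\n"):
        word, tag, head, label = line.strip().split("\t")[:4]
        tokens.append((word, tag.lower(), int(head), label.lower()))
    return tokens

# e.g. parse_dep_output(depparser.dep_parse_sentence("I am going to the market."))
# should return tuples whose labels include "root", "sub", "obj", etc.,
# matching the dependency filter used above.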
def __init__(self, addr, zpar_model_path, model_list, *args, **kwds):
    # store the hostname and port number
    self.myhost, self.myport = addr

    # store the link to the loaded zpar object
    self.z = ZPar(zpar_model_path)

    # initialize the parent class
    _baseclass.__init__(self, addr, *args, **kwds)

    # Call the individual loading functions
    # and only register the appropriate methods
    if 'tagger' in model_list:
        tagger = self.z.get_tagger()
        self.register_function(tagger.tag_sentence)
        self.register_function(tagger.tag_file)

    if 'parser' in model_list:
        parser = self.z.get_parser()
        self.register_function(parser.parse_sentence)
        self.register_function(parser.parse_file)

    if 'depparser' in model_list:
        parser = self.z.get_depparser()
        self.register_function(parser.dep_parse_sentence)
        self.register_function(parser.dep_parse_file)

    # register the function to remotely stop the server
    self.register_function(self.stop_server)

    self.quit = False
def __init__(self, zpar_model_directory=None, hostname=None, port=None):
    """
    Initialize the parser wrapper.

    Parameters
    ----------
    zpar_model_directory : str, optional
        The path to the directory containing the ZPar constituency model.
    hostname : str, optional
        The name of the machine on which the ZPar server is running, if any.
    port : int, optional
        The port at which the ZPar server is running, if any.

    Raises
    ------
    OSError
        If ZPar couldn't be loaded.
    """
    self.zpar_model_directory = zpar_model_directory
    if self.zpar_model_directory is None:
        self.zpar_model_directory = os.getenv("ZPAR_MODEL_DIR",
                                              "zpar/english")

    # TODO: allow pre-tokenized input
    self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    self._zpar_proxy = None
    self._zpar_ref = None

    # if a port is specified, then we want to use the server
    if port:

        # if no hostname was specified, then try the local machine
        hostname = "localhost" if hostname is None else hostname
        logging.info(f"Connecting to zpar server at {hostname}:{port} ...")

        # see if a server actually exists
        connected, server_proxy = self._get_rpc(hostname, port)
        if connected:
            self._zpar_proxy = server_proxy
        else:
            logging.warning('Could not connect to zpar server.')

    # otherwise, we want to use the python zpar module
    else:

        logging.info("Trying to locate zpar shared library ...")
        try:
            # Create a zpar wrapper data structure
            z = ZPar(self.zpar_model_directory)
            self._zpar_ref = z.get_parser()
        except OSError as e:
            logging.warning("Could not load zpar via python-zpar. "
                            "Did you set `ZPAR_MODEL_DIR` correctly?")
            raise e
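# Hypothetical sketch of the _get_rpc() helper the constructor above relies on
# (its real implementation is not shown in this snippet, and the real helper is
# a method of the wrapper class): open an XML-RPC proxy to the given host and
# port and probe it once so connection failures are reported up front. The
# probe call and error handling are assumptions.
import xmlrpc.client

def _get_rpc(hostname, port):
    proxy = xmlrpc.client.ServerProxy(f"http://{hostname}:{port}",
                                      allow_none=True)
    try:
        # any cheap call works as a liveness probe, provided the server
        # exposes XML-RPC introspection
        proxy.system.listMethods()
        return True, proxy
    except OSError:
        return False, None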
def setUp():
    """
    set up things we need for the tests
    """
    global z, tagger
    assert 'ZPAR_MODEL_DIR' in os.environ
    model_dir = os.environ['ZPAR_MODEL_DIR']
    z = ZPar(model_dir)
    tagger = z.get_tagger()
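# A minimal sketch of a matching teardown and a smoke test for the tagger
# fixture above; test_tag_sentence is illustrative, and the expected string
# mirrors the tagged example used elsewhere in these snippets rather than
# coming from the original test suite.
def tearDown():
    # release the loaded ZPar models
    z.close()

def test_tag_sentence():
    tagged_sent = tagger.tag_sentence("I am going to the market.")
    assert tagged_sent == "I/PRP am/VBP going/VBG to/TO the/DT market/NN ./."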
def setUp():
    """
    set up things we need for the tests
    """
    global z, depparser
    assert 'ZPAR_MODEL_DIR' in os.environ
    model_dir = os.environ['ZPAR_MODEL_DIR']
    z = ZPar(model_dir)
    depparser = z.get_depparser()
class StoppableServer(_baseclass):

    allow_reuse_address = True

    def __init__(self, addr, zpar_model_path, model_list, *args, **kwds):
        # store the hostname and port number
        self.myhost, self.myport = addr

        # store the link to the loaded zpar object
        self.z = ZPar(zpar_model_path)

        # initialize the parent class
        _baseclass.__init__(self, addr, *args, **kwds)

        # Call the individual loading functions
        # and only register the appropriate methods
        if 'tagger' in model_list:
            tagger = self.z.get_tagger()
            self.register_function(tagger.tag_sentence)
            self.register_function(tagger.tag_file)

        if 'parser' in model_list:
            parser = self.z.get_parser()
            self.register_function(parser.parse_sentence)
            self.register_function(parser.parse_file)
            self.register_function(parser.parse_tagged_sentence)
            self.register_function(parser.parse_tagged_file)

        if 'depparser' in model_list:
            parser = self.z.get_depparser()
            self.register_function(parser.dep_parse_sentence)
            self.register_function(parser.dep_parse_file)
            self.register_function(parser.dep_parse_tagged_sentence)
            self.register_function(parser.dep_parse_tagged_file)

        # register the function to remotely stop the server
        self.register_function(self.stop_server)

        self.quit = False

    def serve_forever(self):
        while not self.quit:
            try:
                self.handle_request()
            except KeyboardInterrupt:
                print("\nKeyboard interrupt received, exiting.")
                break
        self.z.close()
        self.server_close()

    def stop_server(self):
        self.quit = True
        return 0, "Server terminated on host %r, port %r" % (self.myhost,
                                                             self.myport)
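# A hedged client-side sketch: assuming _baseclass above is an XML-RPC server
# class (e.g. SimpleXMLRPCServer), a remote caller can invoke the registered
# methods and shut the server down like this. The host and port are
# illustrative.
import xmlrpc.client

proxy = xmlrpc.client.ServerProxy("http://localhost:8859")
print(proxy.tag_sentence("I am going to the market."))
print(proxy.dep_parse_sentence("I am going to the market."))
# asks the serve_forever() loop above to exit and release the models
print(proxy.stop_server())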
def main():
    # set up an argument parser
    parser = argparse.ArgumentParser(prog='zpar_example.py')
    parser.add_argument('--modeldir',
                        dest='modeldir',
                        help="Path to directory containing zpar English models",
                        required=True)

    # parse given command line arguments
    args = parser.parse_args()

    # use the zpar wrapper as a context manager
    with ZPar(args.modeldir) as z:

        # get the tagger and the dependency parser models
        tagger = z.get_tagger()
        depparser = z.get_depparser()

        # tag a sentence
        tagged_sent = tagger.tag_sentence("I am going to the market.")
        print_(tagged_sent)

        # tag an already tokenized sentence
        tagged_sent = tagger.tag_sentence("Do n't you want to come with me to the market ?",
                                          tokenize=False)
        print_(tagged_sent)

        # get the dependency parse of an already tagged sentence
        dep_parsed_sent = depparser.dep_parse_tagged_sentence("I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.")
        print_(dep_parsed_sent)

        # get the dependency parse of an already tokenized sentence
        dep_parsed_sent = depparser.dep_parse_sentence("Do n't you want to come with me to the market ?",
                                                       tokenize=False)
        print_(dep_parsed_sent)

        # get the dependency parse of an already tokenized sentence
        # and include lemma information (assuming you have NLTK as well
        # as its WordNet corpus installed)
        dep_parsed_sent = depparser.dep_parse_sentence("Do n't you want to come with me to the market ?",
                                                       tokenize=False,
                                                       with_lemmas=True)
        print_(dep_parsed_sent)

        # compute POS tags for all sentences in "test.txt"
        # and write the output to "test.tag". Note that the
        # file contains a single sentence per line.
        # The sentences need not be word tokenized
        test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 'test.txt')
        tagger.tag_file(test_file, "test.tag")

        # compute dependency parses for all sentences in "test_tokenized.txt"
        tokenized_test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                           'test_tokenized.txt')
        depparser.dep_parse_file(tokenized_test_file, "test.dep")
def __init__(self, zpar_model_directory=None, hostname=None, port=None):
    self.zpar_model_directory = zpar_model_directory
    if self.zpar_model_directory is None:
        self.zpar_model_directory = os.getenv('ZPAR_MODEL_DIR',
                                              'zpar/english')

    self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    self._zpar_proxy = None
    self._zpar_ref = None

    # if a port is specified, then we want to use the server
    if port:

        # if no hostname was specified, then try the local machine
        hostname = 'localhost' if not hostname else hostname
        logging.info('Trying to connect to zpar server at {}:{} ...'
                     .format(hostname, port))

        # try to see if a server actually exists
        connected, server_proxy = self._get_rpc(hostname, port)
        if connected:
            self._zpar_proxy = server_proxy
        else:
            logging.warning('Could not connect to zpar server')

    # otherwise, we want to use the python zpar module
    else:

        logging.info('Trying to locate zpar shared library ...')

        # get the path to the zpar shared library via the environment
        # variable
        zpar_library_dir = os.getenv('ZPAR_LIBRARY_DIR', '')
        zpar_library_path = os.path.join(zpar_library_dir, 'zpar.so')

        try:
            # Create a zpar wrapper data structure
            z = ZPar(self.zpar_model_directory)
            self._zpar_ref = z.get_parser()
        except OSError as e:
            logging.warning('Could not load zpar via python-zpar. '
                            'Did you set ZPAR_LIBRARY_DIR correctly? '
                            'Did you set ZPAR_MODEL_DIR correctly?')
            raise e
def read_data_use(option, sen2id):
    file_name = option.use_data_path
    max_length = option.num_steps
    dict_size = option.dict_size

    time1 = time.time()
    Rake = RAKE.Rake(RAKE.SmartStopList())
    z = ZPar(option.pos_path)
    tagger = z.get_tagger()
    time2 = time.time()
    print("read data load time: ", time2 - time1)

    with open(file_name) as f:
        data = []
        vector = []
        sta_vec_list = []
        j = 0
        for line in f:
            # truncate sentences longer than 15 tokens
            if len(line.strip().split()) > 15:
                line = ' '.join(line.strip().split()[:15])
            sta_vec = list(np.zeros([option.num_steps - 1]))

            # extract keywords with RAKE and POS-tag the sentence with ZPar
            keyword = Rake.run(line.strip())
            pos_list = tagger.tag_sentence(line.strip()).split()
            pos = list(zip(*[x.split('/') for x in pos_list]))[0]

            if keyword != []:
                keyword = list(list(zip(*keyword))[0])
                keyword_new = []
                linewords = line.strip().split()
                # mark the position of every token covered by a keyword phrase
                for i in range(len(linewords)):
                    for item in keyword:
                        length11 = len(item.split())
                        if ' '.join(linewords[i:i + length11]) == item:
                            keyword_new.extend([i + k for k in range(length11)])
                for i in range(len(keyword_new)):
                    ind = keyword_new[i]
                    if ind <= option.num_steps - 2:
                        sta_vec[ind] = 1

            if option.keyword_pos == True:
                sta_vec_list.append(keyword_pos2sta_vec(option, sta_vec, pos))
            else:
                sta_vec_list.append(list(np.zeros([option.num_steps - 1])))

            data.append(sen2id(line.strip().lower().split()))

    data_new = array_data(data, max_length, dict_size)
    return data_new, sta_vec_list  # sentence, keyvector
def read_data_use1(option, sen2id):
    file_name = option.use_data_path
    max_length = option.num_steps
    dict_size = option.dict_size

    Rake = RAKE.Rake(RAKE.SmartStopList())
    z = ZPar(option.pos_path)
    tagger = z.get_tagger()

    with open(file_name) as f:
        data = []
        vector = []
        sta_vec_list = []
        j = 0
        for line in f:
            print('sentence:' + line)
            sta_vec = list(np.zeros([option.num_steps - 1]))

            keyword = Rake.run(line.strip())
            pos_list = tagger.tag_sentence(line.strip()).split()
            pos = list(zip(*[x.split('/') for x in pos_list]))[0]
            print(keyword)

            if keyword != []:
                keyword = list(list(zip(*keyword))[0])
                keyword_new = []
                # map each keyword token back to its position in the sentence
                for item in keyword:
                    tem1 = [line.strip().split().index(x)
                            for x in item.split()
                            if x in line.strip().split()]
                    print('id', tem1)
                    keyword_new.extend(tem1)
                print(keyword_new)
                for i in range(len(keyword_new)):
                    ind = keyword_new[i]
                    if ind <= option.num_steps - 2:
                        sta_vec[ind] = 1

            if option.keyword_pos == True:
                sta_vec_list.append(keyword_pos2sta_vec(option, sta_vec, pos))
            else:
                sta_vec_list.append(list(np.zeros([option.num_steps - 1])))
            print(keyword_pos2sta_vec(option, sta_vec, pos))

            data.append(sen2id(line.strip().lower().split()))

    data_new = array_data(data, max_length, dict_size)
    return data_new, sta_vec_list  # sentence, keyvector
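# Small self-contained illustration of the tag-splitting idiom used by both
# read_data_use variants above: the tagger returns "word/TAG" tokens, and
# zip(*...) separates them into a word tuple and a POS-tag tuple. The model
# directory and sentence are illustrative.
from zpar import ZPar

with ZPar('english-models') as z:
    tagger = z.get_tagger()
    pos_list = tagger.tag_sentence("the cat sat on the mat").split()
    words, tags = zip(*[x.split('/') for x in pos_list])
    # e.g. words -> ('the', 'cat', ...), tags -> ('DT', 'NN', ...)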
def main(giga_db_loc, n_docs, pos_tag=False, parse=False):
    docs = Gigaword(giga_db_loc, limit=n_docs)
    sbd = nltk.data.load('tokenizers/punkt/english.pickle')

    from zpar import ZPar
    n = 0
    with ZPar('models/zpar') as z:
        tagger = z.get_tagger()
        if parse:
            parser = z.get_depparser()
        for doc in docs:
            sentences = sbd.tokenize(doc)
            for sent in sentences:
                if parse:
                    dep_parsed_sent = parser.dep_parse_sentence(sent)
                elif pos_tag:
                    tags = tagger.tag_sentence(sent)
                n += len(sent)
    print n
from zpar import ZPar

if __name__ == '__main__':
    # set up an argument parser
    parser = argparse.ArgumentParser(prog='zpar_example.py')
    parser.add_argument('--modeldir',
                        dest='modeldir',
                        help="Path to directory containing zpar English models",
                        required=True)

    # parse given command line arguments
    args = parser.parse_args()

    # use the zpar wrapper as a context manager
    with ZPar(args.modeldir) as z:

        # get the tagger and the dependency parser models
        tagger = z.get_tagger()
        depparser = z.get_depparser()

        # tag a sentence
        tagged_sent = tagger.tag_sentence("I am going to the market.")
        print_(tagged_sent)

        # tag an already tokenized sentence
        tagged_sent = tagger.tag_sentence("Do n't you want to come with me to the market ?",
                                          tokenize=False)
        print_(tagged_sent)

        # get the dependency parse of an already tagged sentence
from six import print_
from zpar import ZPar

# use the zpar wrapper as a context manager
with ZPar('english-models') as z:

    # get the tagger, the dependency parser, and the constituency parser models
    tagger = z.get_tagger()
    depparser = z.get_depparser()
    parser = z.get_parser()

    # tag a sentence
    tagged_sent = tagger.tag_sentence("I am going to the market.")
    print_(tagged_sent)

    # tag an already tokenized sentence
    tagged_sent = tagger.tag_sentence("Do n't you want to come with me to the market ?",
                                      tokenize=False)
    print_(tagged_sent)

    # get the dependency parse of an already tagged sentence
    dep_parsed_sent = depparser.dep_parse_tagged_sentence("I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.")
    print_(dep_parsed_sent)

    # get the dependency parse of an already tokenized sentence
    dep_parsed_sent = depparser.dep_parse_sentence("Do n't you want to come with me to the market ?",
                                                   tokenize=False)
    print_(dep_parsed_sent)

    # get the dependency parse of an already tokenized sentence
    # and include lemma information (assuming you have NLTK as well
    # as its WordNet corpus installed)
    dep_parsed_sent = depparser.dep_parse_sentence("Do n't you want to come with me to the market ?",
                                                   tokenize=False,
                                                   with_lemmas=True)
    print_(dep_parsed_sent)
def extractStockNews(stock_newsDir, symCompHash, sentNUM):
    snp_comp = symCompHash.values()
    zparModel = ZPar('english-models')
    #tagger = zparModel.get_tagger()
    depparser = zparModel.get_depparser()
    stemmer = PorterStemmer()

    dayNews = []
    for dayDir in sorted(os.listdir(stock_newsDir)):
        if len(dayDir) != 10: continue
        #if int(dayDir[-2:]) > 5: continue
        #if dayDir != "2015-04-30": continue

        # collect the normalized leading sentences of each news article
        newsContents = set()
        for newsfile in sorted(os.listdir(stock_newsDir + dayDir)):
            content = open(stock_newsDir + dayDir + "/" + newsfile, "r").read()
            printable = set(string.printable)
            content = filter(lambda x: x in printable, content)
            #sents = get_valid_news_content(content)
            sents = get_valid_1stpara_news(content)
            if sents is None: continue
            headline = re.sub("^(rpt )?update\s*\d+\s", "",
                              "###".join(sents[:sentNUM]).lower())
            headline = re.sub("\s+", " ", headline)
            newsContents.add(headline)

        oneDayNews = []  # [(matchedSNPComp, headline), ...]
                         # matchedSNPComp: [(matchedPart, WholeCompName), ...]
        fullNameNum = 0
        doubtCounter = Counter()

        if 0:
            print "\n".join(sorted(list(newsContents)))
            continue

        newsHash = {}
        headlineCompHash = {}
        for headline in newsContents:
            # match full S&P company names and a short list of ticker symbols
            fullMatch = findComp_name(headline.replace("###", " "), snp_comp)
            #symMatch = [(word, symCompHash[word]) for word in headline.replace("###", " ").split() if word in symCompHash and word not in ["a", "an", "has"]]
            symMatch = [word for word in headline.replace("###", " ").split()
                        if word in ["ge", "gt", "gm", "aig", "cvs", "oi",
                                    "adm", "jpm", "twc", "cvc", "se"]]
            symMatch = list([symCompHash[sym] for sym in set(symMatch)])

            if fullMatch is not None or len(symMatch) > 0:
                if 0:
                    print "---------------------------"
                    print fullMatch, symMatch
                    print headline
                    continue
                if fullMatch is not None:
                    symMatch.extend(fullMatch)
                headlineCompHash[headline] = symMatch

                # get valid words in headline
                parsed_sents = [depparser.dep_parse_sentence(sent)
                                for sent in headline.split("###")]
                triples = frmTriple(parsed_sents, None)
                triples = [stemmer.stem(word) for word in triples
                           if word not in [":", "(", ")", ",", ".", "\"", "'"]]
                # deduplicate headlines that reduce to the same stemmed word set
                sortedText = " ".join(sorted(triples))
                if sortedText not in newsHash:
                    newsHash[sortedText] = headline

        for impText, headline in newsHash.items():
            fullNameNum += 1
            oneDayNews.append((headlineCompHash[headline], headline, impText.split()))

        #doubtMatch = [matchedComp[idx] for idx in range(len(matchedComp)) if matchScore[idx] > 0.33 and matchScore[idx] < 0.66]
        #wrongMatch = [matchedComp[idx] for idx in range(len(matchedComp)) if matchScore[idx] <= 0.33]
        #print "full", fullNameNum, len(newsContents), round(float(fullNameNum)/len(newsContents), 2)
        print "## Stock news extracting done in day", dayDir, " #snp_matched", fullNameNum, " out of all", len(newsContents), time.asctime()
        dayNews.append(oneDayNews)
        #break
    return dayNews
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import pickle as pkl

from config import config
config = config()

from utils import *
import sys
sys.path.insert(0, config.dict_path)
from dict_use import *

import RAKE
Rake = RAKE.Rake(RAKE.SmartStopList())

from zpar import ZPar
z = ZPar(config.pos_path)
tagger = z.get_tagger()

tt_proportion = 0.9


class dataset(object):

    def __init__(self, input, sequence_length, target):
        self.input = input
        self.target = target
        self.sequence_length = sequence_length
        self.length = len(input)

    def __call__(self, batch_size, step):
        batch_num = self.length // batch_size
        step = step % batch_num
        return self.input[step * batch_size:(step + 1) * batch_size], \
            self.sequence_length[step * batch_size:(step + 1) * batch_size], \
            self.target[step * batch_size:(step + 1) * batch_size]
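# Minimal sketch of driving the dataset class above with toy arrays; the
# shapes, dtypes, and batch size are illustrative assumptions rather than the
# values used by the original data pipeline.
toy_input = np.zeros((8, 15), dtype=np.int32)   # token ids, one row per sentence
toy_lengths = np.full(8, 15, dtype=np.int32)    # sequence lengths
toy_target = np.zeros((8, 15), dtype=np.int32)  # target token ids
toy_set = dataset(toy_input, toy_lengths, toy_target)

batch_size = 4
for step in range(toy_set.length // batch_size):
    batch_input, batch_lengths, batch_target = toy_set(batch_size, step)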