Beispiel #1
0
    def generate_table(self, txt):
        """Write one row per sentence of the parsed page to a tab-separated file.

        Walks ``self.section_text`` (indexed section -> paragraph -> sentence)
        and pairs every sentence with the entry at the same position in
        ``self.citation``, the section name from ``self.section_name`` and the
        page-level metadata found in ``self.page["parse"]``. The timestamp is
        taken from the first revision listed for the page in ``self.revision``.

        :param txt: target handed to ``DataFrame.to_csv`` (path or buffer)
        """
        parse = self.page["parse"]
        # The "pages" mapping is looked up with the page id converted to text.
        # ``str`` replaces the Python-2-only ``unicode`` builtin, which raises
        # NameError on Python 3.
        pageid = str(parse["pageid"])
        first_revision = self.revision["query"]["pages"][pageid]["revisions"][0]
        timestamp = first_revision["timestamp"]

        table = []
        for i, paragraphs in enumerate(self.section_text):
            for j, sentences in enumerate(paragraphs):
                for k, statement in enumerate(sentences):
                    table.append({
                        "entity_id": parse["pageid"],
                        "revision_id": parse["revid"],
                        "timestamp": timestamp,
                        "entity_title": parse["title"],
                        "section_id": i,
                        "section": self.section_name[i],
                        "prg_idx": j,
                        "sentence_idx": k,
                        "statement": statement,
                        # citations are stored in a structure parallel to
                        # section_text, hence the same three indices
                        "citations": self.citation[i][j][k],
                    })

        # explicit column list fixes the column order of the output file
        columns = [
            "entity_id", "revision_id", "timestamp",
            "entity_title", "section_id", "section",
            "prg_idx", "sentence_idx", "statement",
            "citations",
        ]
        data = pd.DataFrame(table, columns=columns)
        data.to_csv(txt, sep="\t", index=False, encoding="utf-8")
Beispiel #2
0
def main():
    """Clean the text columns of ``data.csv`` and write ``clean_data.csv``.

    The columns ``product_title``, ``product_description`` and
    ``search_term`` are each passed through ``clean_entry``, tokenized, and
    reduced to their distinct non-stop-word tokens. Rows containing missing
    values are dropped before the result is written.

    The tagger, lemmatizer and stop-word set are published as module globals
    (presumably for ``clean_entry`` -- confirm against its definition).
    """
    global prd_entry, stopwords, lemmatizer, tagger
    tagger = PerceptronTagger()
    lemmatizer = WordNetLemmatizer()
    # English stop words, held in a set for fast membership tests
    stopwords = set(corpus.stopwords.words('english'))

    def _content_tokens(text):
        """Return the distinct non-stop-word tokens of *text*, space-joined.

        Tokens keep their first-occurrence order. Joining a ``set`` directly
        would emit them in an arbitrary order that changes between runs.
        """
        seen = set()
        kept = []
        for token in word_tokenize(text):
            if token in stopwords or token in seen:
                continue
            seen.add(token)
            kept.append(token)
        return ' '.join(kept)

    # reads data from data.csv
    data = pandas.read_csv('data.csv')

    # same cleaning pipeline for every free-text column
    for column in ('product_title', 'product_description', 'search_term'):
        data[column] = data[column].map(clean_entry).map(_content_tokens)

    data = data.dropna()

    # writes the data to new csv file
    data.to_csv('clean_data.csv')
Beispiel #3
0
def convert_item_name(file):
    """Zero-pad single-digit item names in a CSV file, rewriting it in place.

    Every ``item_name`` made of one capital letter, one digit and an optional
    lowercase letter (e.g. ``A1`` or ``B2c``) gets a ``0`` inserted after the
    letter (``A01``, ``B02c``). Each name that is changed is printed first.
    All other values, including missing ones, are left untouched.

    :param file: path of the CSV file; it must contain an ``item_name`` column
    """
    data = pd.read_csv(file)
    item_name_pattern = re.compile(r'[A-Z][0-9][a-z]?$')
    for i, name in data['item_name'].items():
        # empty cells are read as NaN (a float); the regex only accepts text
        if not isinstance(name, str):
            continue
        if item_name_pattern.match(name):
            print(name)
            data.at[i, 'item_name'] = name[0] + '0' + name[1:]

    data.to_csv(file, encoding='utf-8', index=False)
Beispiel #4
0
    def predict(self,
                input_dir,
                output_dir,
                rw_type,
                input_format,
                chunk_len=100,
                test_scores=False,
                output_confidence=False,
                special_model_path=None):
        """
        Tag each file in the input directory (txt or tsv files) and write the
        results to output_dir. Also adds a folder "result_stats" with runtime
        information to the output_dir.

        tsv files must have at least the column 'tok'; files without it are
        skipped. A missing 'sentstart' column is added with 'no' in every row.
        If no model file exists at the expected location, a warning is logged
        and nothing is predicted.

        :param input_dir: string value: path to input directory
        :param output_dir: string value: path to output directory
        :param rw_type: string value: direct, indirect, freeIndirect or reported
        :param input_format: string value: "txt" files are converted with
            convert_txtfile_to_dateframe; any other value is read as a
            tab-separated file
        :param chunk_len: passed as ``maxlen`` to
            create_sentlist_from_file_batchmax
        :param test_scores: if True, scores are computed per file and in total
            with calculate_scores and written to
            result_stats/<rw_type>_test_scores.tsv
        :param output_confidence: if True, the <rw_type>_conf column is added
            to the output next to <rw_type>_pred
        :param special_model_path: sub-directory of "models" (next to this
            script) to load final-model.pt from, instead of the directory
            named after rw_type
        :return: None
        """
        # time the prediction
        start_time = datetime.datetime.now().replace(microsecond=0)
        # create a subdir for testing and overview information in the outputdir
        result_subdir = "result_stats"
        os.makedirs(os.path.join(output_dir, result_subdir), exist_ok=True)

        # load the model; models live in "models/<name>" next to this script
        curr_path = os.path.dirname(os.path.abspath(__file__))
        model_dir = rw_type if special_model_path is None else special_model_path
        model_path = os.path.join(curr_path, "models", model_dir,
                                  "final-model.pt")
        if not os.path.exists(model_path):
            self.logger.warning(
                "Predicting {} aborted. Model not found at path '{}'. Please download a model and put it into "
                "the appropriate directory. The model file must be named final-model.pt."
                .format(rw_type, model_path))
            return

        self.logger.info("loading model {}".format(model_path))
        model = SequenceTagger.load(model_path)
        self.logger.info("model loaded")

        pred_col = rw_type + "_pred"
        conf_col = rw_type + "_conf"

        # if test mode, collect score data (initialize in any case)
        score_dict = {"file": [], "f1": [], "precision": [], "recall": []}
        scored_frames = []

        for file in os.listdir(input_dir):
            # swap only the last extension, so "a.1.txt" and "a.2.txt" do not
            # both end up as "a.tsv"
            resfile_name = os.path.splitext(file)[0] + ".tsv"
            self.logger.info("predicting {}".format(file))
            # read the file and convert to dataframe
            if input_format == "txt":
                data = self.convert_txtfile_to_dateframe(
                    os.path.join(input_dir, file))
            else:
                data = pd.read_csv(os.path.join(input_dir, file),
                                   sep="\t",
                                   quoting=3,
                                   encoding="utf-8",
                                   na_values=[])

            # check for tok column:
            if "tok" not in data.columns:
                self.logger.warning(
                    "Column 'tok' is missing in file {}. File will be skipped."
                    .format(file))
                continue
            if "sentstart" not in data.columns:
                self.logger.warning(
                    "Column 'sentstart' is missing in file {}. Will be added with default values (all 'no')."
                    .format(file))
                data["sentstart"] = ["no"] * len(data)

            self.logger.debug("TEST: data head:\n {}".format(data.head(10)))
            # create sentlist (based on max chunk length)
            sent_list = self.create_sentlist_from_file_batchmax(
                data, maxlen=chunk_len, compare_column="NaN")
            # predict
            res_dict = {"tok": [], pred_col: [], conf_col: []}
            for sent in sent_list:
                model.predict(sent)
                # one entry per tagged token; only the first label is used
                for entity in sent.to_dict(tag_type="cat")["entities"]:
                    label = entity["labels"][0].to_dict()
                    res_dict["tok"].append(entity["text"])
                    res_dict[pred_col].append(label["value"])
                    res_dict[conf_col].append(label["confidence"])
            pred_df = pd.DataFrame(res_dict)

            # create output
            # if there is a mismatch in file length after prediction, still save the results
            if len(data) != len(pred_df):
                self.logger.warning(
                    "File length changed when predicting for file {} (before: {}, after: {})\n"
                    "Result file will be saved with prefix 'warn_'; additional columns are lost."
                    .format(file, len(data), len(pred_df)))
                pred_df.to_csv(os.path.join(output_dir,
                                            "warn_" + resfile_name),
                               index=False,
                               sep="\t",
                               encoding="utf-8")
                continue

            # if everything is okay, add the new column(s) to the original data and save
            if output_confidence:
                data[conf_col] = pred_df[conf_col]
            data[pred_col] = pred_df[pred_col]
            data.to_csv(os.path.join(output_dir, resfile_name),
                        index=False,
                        sep="\t",
                        encoding="utf-8")

            # calculate the testscores:
            if test_scores:
                self.logger.info("Calculate scores for {}".format(file))
                if rw_type in data.columns and pred_col in data.columns:
                    data, f1, prec, rec = self.calculate_scores(data, rw_type)
                    score_dict["file"].append(file)
                    score_dict["f1"].append(f1)
                    score_dict["precision"].append(prec)
                    score_dict["recall"].append(rec)
                    scored_frames.append(data)
                else:
                    self.logger.warning(
                        "Skipping test scores for file {}: Missing column {} and/or {}"
                        .format(file, rw_type, pred_col))

        end_time = datetime.datetime.now().replace(microsecond=0)

        # write an overview file when the process is finished
        res_text = "RW Tagger (predict): Model {}\n" \
                   "Predict time:\nstart: {}\nend:{}\ntotal: {}" \
            .format(model_path, start_time, end_time, end_time - start_time)
        # if in test mode, calculate the final scores (for all the data) and save the test score df
        if test_scores:
            self.logger.info("Calculate total scores")
            # DataFrame.append no longer exists in pandas 2.x; the per-file
            # frames are collected in a list and concatenated once
            if scored_frames:
                all_predictions_df = pd.concat(scored_frames)
            else:
                all_predictions_df = pd.DataFrame()
            if len(all_predictions_df) > 0:
                self.logger.debug("all_predictions_len: {}".format(
                    len(all_predictions_df)))
                all_predictions_df, f1, prec, rec = self.calculate_scores(
                    all_predictions_df, rw_type)
                score_dict["file"].append("total")
                score_dict["f1"].append(f1)
                score_dict["precision"].append(prec)
                score_dict["recall"].append(rec)
                score_df = pd.DataFrame(score_dict)
                score_df.to_csv(os.path.join(output_dir, result_subdir,
                                             rw_type + "_test_scores.tsv"),
                                index=False,
                                sep="\t",
                                encoding="utf-8")
                res_text += "\nTotal test scores (for detailed scores see {}_test_scores.tsv):\n" \
                            "f1: {}, precision: {}, recall: {}".format(rw_type, f1, prec, rec)
                self.logger.info(
                    "Total scores for {}: f1: {}, precision: {}, recall: {}"
                    .format(rw_type, f1, prec, rec))
        with open(os.path.join(output_dir, result_subdir,
                               rw_type + "_overview.txt"),
                  "w",
                  encoding="utf-8") as f:
            f.write(res_text)
import re
from nltk.util import ngrams
from nltk.corpus import PlaintextCorpusReader, stopwords
# Builds spellingData.csv: for every sub-directory of the working directory,
# the relative frequency of selected spelling variants and bigrams in the
# texts it contains.
stops = stopwords.words('english') + ['thou','thy']
# set copy of the stop words: membership is tested once per token below
stop_set = set(stops)
#get the list of directories
dirs = glob('*/')
# matches tokens whose first character is not a word character
punctuation = re.compile(r'[\W]')
punct = [',','.','&','{','}','?',"'",'-',';',':','|','(',')','[',']']
punct_set = set(punct)
# bigrams and single words whose relative frequency is recorded
target_bigrams = [('your','self'),('our','selfe'),('him','self')]
target_words = ['have','haue','good','goode','never','neuer','come','cum','unto','vnto','which','whych','kyng','king','kynge','verbe','verb','whiche','hath','has','lorde','lord','yourself','ourself','himself']
#for everything in the list
names=dict()
for d in dirs:
    #make each of the directories a temporary corpus
    #only bigrams are built, to catch split spellings such as "your self"
    name = d[:-1]  # drop the trailing path separator returned by glob
    with cd(d):
        corpus = PlaintextCorpusReader('./','.*')
        stuff = corpus.words()
        tokens = [token for token in stuff if not punctuation.match(token)]
        grams = list(ngrams(tokens,2))
        n = len(grams)
        # a directory without usable tokens yields 0.0 instead of dividing by zero
        gramscount = [(j, grams.count(j)/n if n else 0.0) for j in target_bigrams]
        morphemes = [l for l in stuff if l not in stop_set and l not in punct_set]
        #make a pseudo-dictionary for the single words
        n = len(morphemes)
        series = [(l, morphemes.count(l)/n if n else 0.0) for l in target_words]
        series = dict(series+gramscount)
    names[name]=series
# one column per directory, one row per tracked word / bigram
data = pd.DataFrame(names)
data.to_csv(path_or_buf='spellingData.csv')
# Builds crf_dataset.csv: one row per token with its sentence number,
# part-of-speech tag and the tag produced by tag_set for that position.
l = []
tags = []

sentence_columns = zip(data['sentence'], data['term1'], data['term2'],
                       data['relation'])
for i, (text, term1, term2, relation) in enumerate(sentence_columns):
    tags = tag_set(text, term1, term2, relation)
    print(i)

    word = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(word)

    s = 'Sentence: '+str(i)
    for k, (w, pos) in enumerate(tagged):
        # tags is indexed by token position; an IndexError here means
        # tag_set returned fewer tags than there are tokens
        l.append([s, w, pos, tags[k]])


columns= ['Sentence #', 'Word','POS', 'Tag']
data = pd.DataFrame(l, columns=columns)

data.to_csv('crf_dataset.csv', encoding='UTF-8', index=False)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens


#..................................................................................................................
# Starts from here

# Sorts the dataset by the date in its second column (new_file.csv), then
# adds the calendar quarter of that date as column '5' (new_file1.csv).
data = pd.read_csv("path to dataset")
# rebuild the frame from its raw values so columns are addressed by position
data = pd.DataFrame(np.array(data.values))
data[1] = pd.to_datetime(data[1])  # sorting data w.r.t date
data = data.sort_values(1)
data.to_csv('new_file.csv', index=False)
# resolve the path directly; no file handle needs to be opened (and leaked)
path = os.path.realpath('new_file.csv')
quar = pd.read_csv(path)
quar = pd.DataFrame(np.array(quar.values))
# Finding quarter of each date in date field. Parsing with pandas also copes
# with dates that were written with a time component.
q = pd.to_datetime(quar[1]).dt.quarter.tolist()
quar['5'] = q
quar.to_csv('new_file1.csv', index=False)